More consistent URL parsing when finding versions.

Versions found by wildcard URLs can differ from versions found by
parse_version and the other parsers in url.py.  The wildcards are
constructed more haphazardly than the very specific URL patterns in
url.py, so they can get things wrong.  For example, for this URL:

    https://software.lanl.gov/MeshTools/trac/attachment/wiki/WikiStart/mstk-2.25rc1.tgz

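If we ONLY use URL wildcards, we miss the 'rc' and return just 2.25r
as the version.  To keep the two paths consistent, each URL matched by
a wildcard is now run through spack.url.parse_version to get its version.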

Future: Maybe use the regexes from url.py to scrape web pages, and
then compare the URLs they match for similarity with the original URL,
instead of trying to make a structured wildcard URL pattern?  This
might yield better results.
This commit is contained in:
Todd Gamblin 2015-12-22 16:54:41 -08:00
parent d1d23ec5e6
commit 2b89d9b1db
2 changed files with 10 additions and 4 deletions

View File

@ -1200,6 +1200,8 @@ def find_versions_of_archive(*archive_urls, **kwargs):
for aurl in archive_urls:
list_urls.add(spack.url.find_list_url(aurl))
print list_urls
# Grab some web pages to scrape.
page_map = {}
for lurl in list_urls:
@ -1224,9 +1226,13 @@ def find_versions_of_archive(*archive_urls, **kwargs):
for page_url, content in page_map.iteritems():
# extract versions from matches.
for regex in regexes:
versions.update(
(Version(m.group(1)), urljoin(page_url, m.group(0)))
for m in re.finditer(regex, content))
print regex
print
for m in re.finditer(regex, content):
url = urljoin(page_url, m.group(0))
ver = spack.url.parse_version(url)
versions[ver] = url
return versions

View File

@ -210,7 +210,7 @@ def parse_version_offset(path):
(r'-((\d+\.)*\d+)$', stem),
# e.g. foobar-4.5.1b, foobar4.5RC, foobar.v4.5.1b
(r'[-._]?v?((\d+\.)*\d+[-._]?([a-z]|rc|RC|tp|TP?)\d*)$', stem),
(r'[-._]?v?((\d+\.)*\d+[-._]?([a-z]|rc|RC|tp|TP)?\d*)$', stem),
# e.g. foobar-4.5.0-beta1, or foobar-4.50-beta
(r'-((\d+\.)*\d+-beta(\d+)?)$', stem),