util/web.py: parse new GitLab JS dropdown links (#45764)

Co-authored-by: Harmen Stoppels <harmenstoppels@gmail.com>
This commit is contained in:
Wouter Deconinck 2024-08-17 02:02:03 -05:00 committed by GitHub
parent f0f9a16e4f
commit 553cc3b70a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 31 additions and 4 deletions

View File

@ -0,0 +1,10 @@
<html>
<head>
This is the root page.
</head>
<body>
This is a page with a Vue javascript drop down with links as used in GitLab.
<div class="js-source-code-dropdown" data-css-class="" data-download-artifacts="[]" data-download-links="[{&quot;text&quot;:&quot;tar.gz&quot;,&quot;path&quot;:&quot;/foo-5.0.0.tar.gz&quot;}]"></div>
</body>
</html>

View File

@ -37,6 +37,7 @@ def _create_url(relative_url):
page_4 = _create_url("4.html")
root_with_fragment = _create_url("index_with_fragment.html")
root_with_javascript = _create_url("index_with_javascript.html")
@pytest.mark.parametrize(
@ -148,6 +149,11 @@ def test_find_versions_of_archive_with_fragment():
assert Version("5.0.0") in versions
def test_find_versions_of_archive_with_javascript():
    """Check that version 5.0.0 is found when listing ``root_with_javascript``
    with ``list_depth=0``."""
    found_versions = spack.url.find_versions_of_archive(
        root_tarball, root_with_javascript, list_depth=0
    )
    assert Version("5.0.0") in found_versions
def test_get_header():
headers = {"Content-type": "text/plain"}

View File

@ -7,6 +7,7 @@
import concurrent.futures
import email.message
import errno
import json
import os
import os.path
import re
@ -152,7 +153,8 @@ class HTMLParseError(Exception):
class LinkParser(HTMLParser):
"""This parser just takes an HTML page and strips out the hrefs on the
links. Good enough for a really simple spider."""
links, as well as some javascript tags used on GitLab servers.
Good enough for a really simple spider."""
def __init__(self):
super().__init__()
@ -160,9 +162,18 @@ def __init__(self):
def handle_starttag(self, tag, attrs):
    """Record link targets found in an opening tag.

    Two kinds of tags contribute to ``self.links``:

    * ``<a>``: the value of every ``href`` attribute is appended.
    * ``<div>`` carrying the ``js-source-code-dropdown`` CSS class: the
      ``data-download-links`` attribute is decoded as JSON and the ``path``
      of each entry is appended.

    Dropdown parsing is best effort: a missing attribute, invalid JSON, or
    a malformed entry never raises. Malformed entries are skipped one by
    one, so a single bad entry does not discard the valid ones around it.

    Args:
        tag: name of the tag being opened.
        attrs: list of ``(name, value)`` attribute pairs for the tag.
    """
    if tag == "a":
        self.links.extend(val for key, val in attrs if key == "href")
        return

    if tag != "div":
        return

    # GitLab uses a javascript function to place dropdown links:
    #  <div class="js-source-code-dropdown" ...
    #   data-download-links='[{"path":"/graphviz/graphviz/-/archive/12.0.0/graphviz-12.0.0.zip",...},...]'/>
    # The class attribute is a whitespace-separated token list, so accept the
    # marker class even when the div carries additional classes.
    is_dropdown = any(
        key == "class" and val and "js-source-code-dropdown" in val.split()
        for key, val in attrs
    )
    if not is_dropdown:
        return

    links_str = next((val for key, val in attrs if key == "data-download-links"), None)
    if links_str is None:
        return

    try:
        entries = json.loads(links_str)
    except (ValueError, RecursionError):
        # Not valid JSON (JSONDecodeError is a ValueError) or nested too deeply.
        return

    if not isinstance(entries, list):
        return

    for entry in entries:
        path = entry.get("path") if isinstance(entry, dict) else None
        # Only string paths are usable as links; skip anything else.
        if isinstance(path, str):
            self.links.append(path)
class ExtractMetadataParser(HTMLParser):