util/web.py: parse new GitLab JS dropdown links (#45764)

Co-authored-by: Harmen Stoppels <harmenstoppels@gmail.com>
This commit is contained in:
Wouter Deconinck 2024-08-17 02:02:03 -05:00 committed by GitHub
parent f0f9a16e4f
commit 553cc3b70a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 31 additions and 4 deletions

View File

@ -0,0 +1,10 @@
<html>
<head>
This is the root page.
</head>
<body>
This is a page with a Vue javascript drop down with links as used in GitLab.
<div class="js-source-code-dropdown" data-css-class="" data-download-artifacts="[]" data-download-links="[{&quot;text&quot;:&quot;tar.gz&quot;,&quot;path&quot;:&quot;/foo-5.0.0.tar.gz&quot;}]"></div>
</body>
</html>

View File

@ -37,6 +37,7 @@ def _create_url(relative_url):
page_4 = _create_url("4.html") page_4 = _create_url("4.html")
root_with_fragment = _create_url("index_with_fragment.html") root_with_fragment = _create_url("index_with_fragment.html")
root_with_javascript = _create_url("index_with_javascript.html")
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -148,6 +149,11 @@ def test_find_versions_of_archive_with_fragment():
assert Version("5.0.0") in versions assert Version("5.0.0") in versions
def test_find_versions_of_archive_with_javascript():
versions = spack.url.find_versions_of_archive(root_tarball, root_with_javascript, list_depth=0)
assert Version("5.0.0") in versions
def test_get_header(): def test_get_header():
headers = {"Content-type": "text/plain"} headers = {"Content-type": "text/plain"}

View File

@ -7,6 +7,7 @@
import concurrent.futures import concurrent.futures
import email.message import email.message
import errno import errno
import json
import os import os
import os.path import os.path
import re import re
@ -152,7 +153,8 @@ class HTMLParseError(Exception):
class LinkParser(HTMLParser): class LinkParser(HTMLParser):
"""This parser just takes an HTML page and strips out the hrefs on the """This parser just takes an HTML page and strips out the hrefs on the
links. Good enough for a really simple spider.""" links, as well as some javascript tags used on GitLab servers.
Good enough for a really simple spider."""
def __init__(self): def __init__(self):
super().__init__() super().__init__()
@ -160,9 +162,18 @@ def __init__(self):
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
if tag == "a": if tag == "a":
for attr, val in attrs: self.links.extend(val for key, val in attrs if key == "href")
if attr == "href":
self.links.append(val) # GitLab uses a javascript function to place dropdown links:
# <div class="js-source-code-dropdown" ...
# data-download-links="[{"path":"/graphviz/graphviz/-/archive/12.0.0/graphviz-12.0.0.zip",...},...]"/>
if tag == "div" and ("class", "js-source-code-dropdown") in attrs:
try:
links_str = next(val for key, val in attrs if key == "data-download-links")
links = json.loads(links_str)
self.links.extend(x["path"] for x in links)
except Exception:
pass
class ExtractMetadataParser(HTMLParser): class ExtractMetadataParser(HTMLParser):