util/web.py: parse new GitLab JS dropdown links (#45764)
Co-authored-by: Harmen Stoppels <harmenstoppels@gmail.com>
This commit is contained in:
parent
f0f9a16e4f
commit
553cc3b70a
10
lib/spack/spack/test/data/web/index_with_javascript.html
Normal file
10
lib/spack/spack/test/data/web/index_with_javascript.html
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
This is the root page.
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
This is a page with a Vue javascript drop down with links as used in GitLab.
|
||||||
|
|
||||||
|
<div class="js-source-code-dropdown" data-css-class="" data-download-artifacts="[]" data-download-links="[{&quot;text&quot;:&quot;tar.gz&quot;,&quot;path&quot;:&quot;/foo-5.0.0.tar.gz&quot;}]"></div>
|
||||||
|
</body>
|
||||||
|
</html>
|
@@ -37,6 +37,7 @@ def _create_url(relative_url):
|
|||||||
page_4 = _create_url("4.html")
|
page_4 = _create_url("4.html")
|
||||||
|
|
||||||
root_with_fragment = _create_url("index_with_fragment.html")
|
root_with_fragment = _create_url("index_with_fragment.html")
|
||||||
|
root_with_javascript = _create_url("index_with_javascript.html")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -148,6 +149,11 @@ def test_find_versions_of_archive_with_fragment():
|
|||||||
assert Version("5.0.0") in versions
|
assert Version("5.0.0") in versions
|
||||||
|
|
||||||
|
|
||||||
|
def test_find_versions_of_archive_with_javascript():
|
||||||
|
versions = spack.url.find_versions_of_archive(root_tarball, root_with_javascript, list_depth=0)
|
||||||
|
assert Version("5.0.0") in versions
|
||||||
|
|
||||||
|
|
||||||
def test_get_header():
|
def test_get_header():
|
||||||
headers = {"Content-type": "text/plain"}
|
headers = {"Content-type": "text/plain"}
|
||||||
|
|
||||||
|
@@ -7,6 +7,7 @@
|
|||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import email.message
|
import email.message
|
||||||
import errno
|
import errno
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import os.path
|
import os.path
|
||||||
import re
|
import re
|
||||||
@@ -152,7 +153,8 @@ class HTMLParseError(Exception):
|
|||||||
|
|
||||||
class LinkParser(HTMLParser):
|
class LinkParser(HTMLParser):
|
||||||
"""This parser just takes an HTML page and strips out the hrefs on the
|
"""This parser just takes an HTML page and strips out the hrefs on the
|
||||||
links. Good enough for a really simple spider."""
|
links, as well as some javascript tags used on GitLab servers.
|
||||||
|
Good enough for a really simple spider."""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@@ -160,9 +162,18 @@ def __init__(self):
|
|||||||
|
|
||||||
def handle_starttag(self, tag, attrs):
|
def handle_starttag(self, tag, attrs):
|
||||||
if tag == "a":
|
if tag == "a":
|
||||||
for attr, val in attrs:
|
self.links.extend(val for key, val in attrs if key == "href")
|
||||||
if attr == "href":
|
|
||||||
self.links.append(val)
|
# GitLab uses a javascript function to place dropdown links:
|
||||||
|
# <div class="js-source-code-dropdown" ...
|
||||||
|
# data-download-links="[{"path":"/graphviz/graphviz/-/archive/12.0.0/graphviz-12.0.0.zip",...},...]"/>
|
||||||
|
if tag == "div" and ("class", "js-source-code-dropdown") in attrs:
|
||||||
|
try:
|
||||||
|
links_str = next(val for key, val in attrs if key == "data-download-links")
|
||||||
|
links = json.loads(links_str)
|
||||||
|
self.links.extend(x["path"] for x in links)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class ExtractMetadataParser(HTMLParser):
|
class ExtractMetadataParser(HTMLParser):
|
||||||
|
Loading…
Reference in New Issue
Block a user