Resolve <include-fragment> tags, e.g., in GitHub release pages (#36674)
				
					
				
			This aims to resolve #34164 by resolving the <include-fragment> tags that GitHub has started using for their release pages; see https://github.github.io/include-fragment-element/. This feels a bit hacky but is intended as a starting point for discussion. After reading a page during spidering, the spider first parses the page for include-fragments, fetches them all, and treats each one as a separate page. It then looks for href links in both the page itself and the fetched fragments. Co-authored-by: Alec Scott <alec@bcs.sh>
This commit is contained in:
		
							
								
								
									
										1
									
								
								lib/spack/spack/test/data/web/fragment.html
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								lib/spack/spack/test/data/web/fragment.html
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					<a href="foo-5.0.0.tar.gz">foo-5.0.0.tar.gz</a>
 | 
				
			||||||
							
								
								
									
										13
									
								
								lib/spack/spack/test/data/web/index_with_fragment.html
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								lib/spack/spack/test/data/web/index_with_fragment.html
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,13 @@
 | 
				
			|||||||
 | 
					<html>
 | 
				
			||||||
 | 
					  <head>
 | 
				
			||||||
 | 
					    This is the root page.
 | 
				
			||||||
 | 
					  </head>
 | 
				
			||||||
 | 
					  <body>
 | 
				
			||||||
 | 
					    This is a page with an include-fragment element.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    <script type="module" src="https://unpkg.com/@github/include-fragment-element@latest?module"></script>
 | 
				
			||||||
 | 
					    <include-fragment src="fragment.html">
 | 
				
			||||||
 | 
					      <p>Loading...</p>
 | 
				
			||||||
 | 
					    </include-fragment>
 | 
				
			||||||
 | 
					  </body>
 | 
				
			||||||
 | 
					</html>
 | 
				
			||||||
@@ -31,6 +31,8 @@ def _create_url(relative_url):
 | 
				
			|||||||
page_3 = _create_url("3.html")
 | 
					page_3 = _create_url("3.html")
 | 
				
			||||||
page_4 = _create_url("4.html")
 | 
					page_4 = _create_url("4.html")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					root_with_fragment = _create_url("index_with_fragment.html")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.mark.skipif(sys.platform == "win32", reason="Not supported on Windows (yet)")
 | 
					@pytest.mark.skipif(sys.platform == "win32", reason="Not supported on Windows (yet)")
 | 
				
			||||||
@pytest.mark.parametrize(
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
@@ -143,6 +145,14 @@ def test_find_exotic_versions_of_archive_3():
 | 
				
			|||||||
    assert ver("4.5-rc5") in versions
 | 
					    assert ver("4.5-rc5") in versions
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.skipif(sys.platform == "win32", reason="Not supported on Windows (yet)")
 | 
				
			||||||
 | 
					def test_find_versions_of_archive_with_fragment():
 | 
				
			||||||
 | 
					    versions = spack.util.web.find_versions_of_archive(
 | 
				
			||||||
 | 
					        root_tarball, root_with_fragment, list_depth=0
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    assert ver("5.0.0") in versions
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_get_header():
 | 
					def test_get_header():
 | 
				
			||||||
    headers = {"Content-type": "text/plain"}
 | 
					    headers = {"Content-type": "text/plain"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -75,7 +75,7 @@ class LinkParser(HTMLParser):
 | 
				
			|||||||
    links.  Good enough for a really simple spider."""
 | 
					    links.  Good enough for a really simple spider."""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __init__(self):
        # Python-3 style superclass initialization (replaces the legacy
        # ``HTMLParser.__init__(self)`` call), then an empty list that
        # handle_starttag fills with href values as the page is fed in.
        super().__init__()
        self.links = list()
				
			||||||
    def handle_starttag(self, tag, attrs):
 | 
					    def handle_starttag(self, tag, attrs):
 | 
				
			||||||
@@ -85,6 +85,21 @@ def handle_starttag(self, tag, attrs):
 | 
				
			|||||||
                    self.links.append(val)
 | 
					                    self.links.append(val)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class IncludeFragmentParser(HTMLParser):
 | 
				
			||||||
 | 
					    """This parser takes an HTML page and selects the include-fragments,
 | 
				
			||||||
 | 
					    used on GitHub, https://github.github.io/include-fragment-element."""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __init__(self):
 | 
				
			||||||
 | 
					        super().__init__()
 | 
				
			||||||
 | 
					        self.links = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def handle_starttag(self, tag, attrs):
 | 
				
			||||||
 | 
					        if tag == "include-fragment":
 | 
				
			||||||
 | 
					            for attr, val in attrs:
 | 
				
			||||||
 | 
					                if attr == "src":
 | 
				
			||||||
 | 
					                    self.links.append(val)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def read_from_url(url, accept_content_type=None):
 | 
					def read_from_url(url, accept_content_type=None):
 | 
				
			||||||
    if isinstance(url, str):
 | 
					    if isinstance(url, str):
 | 
				
			||||||
        url = urllib.parse.urlparse(url)
 | 
					        url = urllib.parse.urlparse(url)
 | 
				
			||||||
@@ -550,9 +565,38 @@ def _spider(url, collect_nested):
 | 
				
			|||||||
            page = codecs.getreader("utf-8")(response).read()
            pages[response_url] = page

            # Parse out the include-fragments in the page
            # https://github.github.io/include-fragment-element
            include_fragment_parser = IncludeFragmentParser()
            include_fragment_parser.feed(page)

            fragments = set()
            while include_fragment_parser.links:
                raw_link = include_fragment_parser.links.pop()
                abs_link = url_util.join(response_url, raw_link.strip(), resolve_href=True)

                try:
                    # This seems to be text/html, though text/fragment+html is also used
                    fragment_response_url, _, fragment_response = read_from_url(
                        abs_link, "text/html"
                    )
                except Exception as e:
                    msg = f"Error reading fragment: {(type(e), str(e))}:{traceback.format_exc()}"
                    tty.debug(msg)
                    # BUG FIX: skip this fragment on error. Previously control
                    # fell through to the check below, where
                    # fragment_response_url/fragment_response could be unbound
                    # (NameError on the first failing fragment) or stale from a
                    # previous loop iteration.
                    continue

                if not fragment_response_url or not fragment_response:
                    continue

                fragment = codecs.getreader("utf-8")(fragment_response).read()
                fragments.add(fragment)

                pages[fragment_response_url] = fragment

            # Parse out the links in the page and all fragments
            link_parser = LinkParser()
            link_parser.feed(page)
            for fragment in fragments:
                link_parser.feed(fragment)

            while link_parser.links:
                raw_link = link_parser.links.pop()
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user