| """HTML processing functions""" |
| from __future__ import annotations |
|
|
| from bs4 import BeautifulSoup |
| from requests.compat import urljoin |
|
|
|
|
def extract_hyperlinks(soup: BeautifulSoup, base_url: str) -> list[tuple[str, str]]:
    """Collect the text and absolute URL of every anchor that has an href.

    Args:
        soup: Parsed document to search for ``<a>`` tags.
        base_url: URL that each ``href`` value is joined against, so
            relative links come back as absolute ones.

    Returns:
        One ``(link_text, absolute_url)`` tuple per matching anchor, in the
        order ``soup.find_all`` yields them.
    """
    # href=True restricts the search to anchors that actually carry an href
    # attribute, so the subscript below cannot fail on a bare <a> tag.
    anchors = soup.find_all("a", href=True)
    return [(anchor.text, urljoin(base_url, anchor["href"])) for anchor in anchors]
|
|
|
|
def format_hyperlinks(hyperlinks: list[tuple[str, str]]) -> list[str]:
    """Render each hyperlink as a display string of the form ``text (url)``.

    Args:
        hyperlinks: ``(link_text, link_url)`` pairs to format.

    Returns:
        One formatted string per input pair, in the same order.
    """
    formatted = []
    for link_text, link_url in hyperlinks:
        formatted.append(f"{link_text} ({link_url})")
    return formatted
|
|