| | import { JSDOM, VirtualConsole } from "jsdom"; |
| |
|
/**
 * Recursively removes every `<script>` and `<style>` element from the subtree
 * rooted at `node`, mutating the tree in place.
 *
 * The root itself is never removed; only its descendants are inspected.
 *
 * @param node - Root of the subtree to clean.
 */
function removeTags(node: Node): void {
  // Walk a snapshot: `childNodes` is a live list, so removing a child while
  // iterating it directly would skip the sibling that follows the removed one.
  for (const childNode of Array.from(node.childNodes)) {
    // Compare case-insensitively so elements whose nodeName is not upper-cased
    // (e.g. a `<style>` inside inline SVG) are matched as well.
    const name = childNode.nodeName.toUpperCase();
    if (name === "SCRIPT" || name === "STYLE") {
      node.removeChild(childNode);
    } else {
      removeTags(childNode);
    }
  }
}
/**
 * Builds a rough plain-text rendering of `node` by walking its children.
 *
 * Text children contribute their own text, element children are expanded
 * recursively, and every other node type (comments, etc.) contributes an
 * empty string. The per-child pieces are joined with `"\n"`.
 *
 * @param node - Node whose descendants should be flattened to text.
 * @returns The concatenated text of the subtree; `""` when there are no children.
 */
function naiveInnerText(node: Node): string {
  // The node-type constants are read from the instance itself, so no global
  // `Node` constructor needs to be in scope.
  const { TEXT_NODE, ELEMENT_NODE } = node;

  return Array.from(node.childNodes, (childNode) => {
    switch (childNode.nodeType) {
      case TEXT_NODE:
        // Use the child's own text; the parent's textContent would repeat the
        // whole subtree once per text child.
        return childNode.textContent ?? "";
      case ELEMENT_NODE:
        return naiveInnerText(childNode);
      default:
        return "";
    }
  }).join("\n");
}
| |
|
/**
 * Downloads the page at `url` and returns its visible text with scripts,
 * styles and line breaks stripped.
 *
 * Fetching is best-effort: if the request fails or is aborted by the
 * 10-second timeout, the error is logged and an empty document is parsed
 * instead, so the function resolves to `""` rather than rejecting.
 *
 * @param url - Address of the page to fetch.
 * @returns The flattened text content of the page's `<body>`.
 * @throws Error if the parsed document has no `<body>` element.
 */
export async function parseWeb(url: string): Promise<string> {
  const timeoutMs = 10000;
  const abortController = new AbortController();
  const timeoutId = setTimeout(() => abortController.abort(), timeoutMs);

  let htmlString = "";
  try {
    const response = await fetch(url, { signal: abortController.signal });
    htmlString = await response.text();
  } catch (err) {
    // Deliberate best-effort: log and carry on with an empty document.
    console.log(err);
  } finally {
    // Cancel the pending abort timer so it does not linger after the request
    // has already settled.
    clearTimeout(timeoutId);
  }

  const virtualConsole = new VirtualConsole();
  virtualConsole.on("error", () => {
    // Intentionally empty: errors jsdom reports on the virtual console are ignored.
  });

  const dom = new JSDOM(htmlString, { virtualConsole });

  const body = dom.window.document.querySelector("body");
  if (!body) throw new Error("body of the webpage is null");

  removeTags(body);

  // Drops double spaces and every line break from the extracted text.
  return naiveInnerText(body).replace(/ {2}|\r\n|\n|\r/gm, "");
}
| |
|