# -*- coding:utf-8 -*-
import html
import logging
import re
from collections import defaultdict
from copy import deepcopy
from urllib.parse import unquote, urljoin

from lxml.etree import Comment, strip_elements

from ultradata_math_parser.config import *
from ultradata_math_parser.readability_plus import Document as DocumentPlus
from ultradata_math_parser.utils import *


class BaseParser:
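    """HTML main-content extractor with math-aware cleaning.

    Extracts the article body via XPath heuristics (with readability_plus and
    wild-text fallbacks), prunes boilerplate by link density, and converts
    embedded math (MathML, KaTeX/MathJax markup, LaTeX image renderers)
    into inline LaTeX via ``wrap_math``.
    """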

    def __init__(self):
        self.drop_ids = []
        self.need_comment = False
        self.process_math = True
        self.preserve_math_containers = True
        self.include_tables = True
        self.include_images = False
        self.fallback_min_length = 250
        self.enable_wild_text_fallback = True
        self.enable_readability_fallback = True
        self._logger = logging.getLogger(__name__)

    def xp_1_5(self, tree: HtmlElement):
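        """Try each expression in BODY_XPATH until one yields a subtree with
        enough non-link text; return (body, matched expression number,
        drop_list flag). ``xp_num`` stays "others" when no expression
        produced usable content."""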
        drop_list = False
        xp_num = "others"
        result_body = Element("body")
        for idx, expr in enumerate(BODY_XPATH):
            try:
                subtree = tree.xpath(expr)[0]
                xp_num = str(idx + 1)
            except IndexError:
                continue
            subtree, drop_list = self.prune_unwanted_sections(subtree)
            if len(subtree) == 0:
                xp_num = "others"
                continue
            ptest = subtree.xpath(".//text()[not(ancestor::a)]")
            ptest_len = text_len("".join(ptest))
            all_text_len = text_len(
                "".join(tree.xpath("//p//text()[not(ancestor::a)]"))
            )
            if drop_list:
                if ptest_len <= 50:
                    if all_text_len > 100:
                        xp_num = "others"
                    continue
            else:
                if ptest_len <= 20:
                    if all_text_len > 100:
                        xp_num = "others"
                    continue
            result_body.append(subtree)
            return result_body, xp_num, drop_list
        return result_body, xp_num, drop_list

    def get_content_html(self, cleaned_tree_backup, xp_num="others", base_url=""):
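        """Build the content HTML for a cleaned tree through readability_plus."""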
        # readability_plus
        doc = DocumentPlus(
            cleaned_tree_backup,
            url=base_url,
            xp_num=xp_num,
            need_comment=self.need_comment,
        )
        body = doc.summary(html_partial=True)
        return body

    def _text_length_from_html(self, html_fragment):
        if not html_fragment:
            return 0
        # Parse with lxml.html.fromstring and extract text_content
        # (no longer depends on w3m)
        try:
            tree = fromstring(html_fragment)
            text = tree.text_content()
            return len(text or "")
        except Exception:
            return 0

    def _is_content_sufficient(self, html_fragment):
        return self._text_length_from_html(html_fragment) >= self.fallback_min_length

    def _remove_tables_from_tree(self, tree: HtmlElement) -> HtmlElement:
        if self.include_tables:
            return tree
        for table in list(tree.xpath(".//table")):
            parent = table.getparent()
            if parent is not None:
                parent.remove(table)
        return tree

    def _strip_tables_from_html(self, html_fragment: str) -> str:
        if self.include_tables or not html_fragment:
            return html_fragment
        try:
            wrapper = fromstring(f"<div>{html_fragment}</div>")
        except Exception:
            return html_fragment
        self._remove_tables_from_tree(wrapper)
        return "".join(tostring(child, encoding=str) for child in wrapper)

    def _remove_images_from_tree(self, tree: HtmlElement) -> HtmlElement:
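        """Drop <img>/<picture>/<source> (and <map>) nodes, but first rescue
        LaTeX that hides in an image's alt text (e.g. codecogs renders) by
        re-inserting it as an inline formula."""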
        for node in list(tree.xpath(".//img|.//picture|.//source")):
            # Before removing an <img>, check whether its alt text holds a LaTeX formula
            if node.tag == "img":
                alt = node.get("alt", "")
                src = node.get("src", "")
                if alt:
                    # URL-decode (handles encodings such as &space;)
                    alt_decoded = unquote(alt.replace('&space;', ' '))
                    # Detect LaTeX features in the alt text
                    is_latex = False
                    # 1. Delimited by $
                    if alt_decoded.strip().startswith('$') and len(alt_decoded.strip()) > 2:
                        is_latex = True
                    # 2. Starts with \[ or ends with \] (display math)
                    elif alt_decoded.strip().startswith('\\[') or alt_decoded.strip().endswith('\\]'):
                        is_latex = True
                    # 3. Contains LaTeX commands (\frac, \sum, \alpha, etc.)
                    elif re.search(r'\\[a-zA-Z]+', alt_decoded):
                        is_latex = True
                    # 4. Contains superscripts/subscripts
                    elif re.search(r'\^|_\{|_[a-zA-Z0-9]', alt_decoded):
                        is_latex = True
                    # 5. src contains LaTeX-related keywords (auxiliary signal)
                    elif any(kw in src.lower() for kw in ['latex', 'codecogs', 'math', 'tex', 'equation']):
                        if len(alt_decoded.strip()) > 1:
                            is_latex = True
                    if is_latex:
                        # Create a span to hold the LaTeX formula
                        new_span = Element("span")
                        # Make sure the formula is properly wrapped as math
                        if alt_decoded.strip().startswith('$') or alt_decoded.strip().startswith('\\['):
                            new_span.text = alt_decoded
                        else:
                            new_span.text = wrap_math(alt_decoded)
                        # Insert the span before the img
                        parent = node.getparent()
                        if parent is not None:
                            node.addprevious(new_span)
            # Remove the image node
            parent = node.getparent()
            if parent is not None:
                parent.remove(node)
        for html_map in list(tree.xpath(".//map")):
            parent = html_map.getparent()
            if parent is not None:
                parent.remove(html_map)
        return tree

    def _strip_images_from_html(self, html_fragment: str) -> str:
        if not html_fragment:
            return html_fragment
        try:
            wrapper = fromstring(f"<div>{html_fragment}</div>")
        except Exception:
            return html_fragment
        self._remove_images_from_tree(wrapper)
        return "".join(tostring(child, encoding=str) for child in wrapper)

    def recover_wild_text(self, tree, base_url="", aggressive=False):
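        """Fallback extractor: collect paragraph-like nodes (optionally tables,
        plus divs/sections/lists when ``aggressive``) from a pruned copy of the
        tree and return them as a <div> of <p> elements, or None if nothing
        useful survives."""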
        if tree is None:
            return None
        working_tree = deepcopy(tree)
        try:
            pruned_tree, _ = self.prune_unwanted_sections(working_tree)
        except Exception:
            pruned_tree = working_tree
        search_expr = ".//p|.//pre|.//code|.//blockquote|.//q|.//quote"
        if self.include_tables:
            search_expr += "|.//table"
        if aggressive:
            search_expr += "|.//div|.//section|.//article|.//li"
        try:
            nodes = pruned_tree.xpath(search_expr)
        except Exception:
            nodes = []
        if not nodes:
            return None
        container = Element("div")
        seen_texts = set()
        for node in nodes:
            try:
                text_value = trim(node.text_content())
            except Exception:
                text_value = None
            if not text_value:
                continue
            if text_len(text_value) < 10:
                continue
            if text_value in seen_texts:
                continue
            seen_texts.add(text_value)
            if node.tag == "table":
                if self.include_tables:
                    container.append(deepcopy(node))
                continue
            else:
                paragraph = Element("p")
                paragraph.text = text_value
                container.append(paragraph)
        if len(container) == 0:
            return None
        return tostring(container, encoding=str)

    def readability_fallback(self, tree, base_url=""):
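        """Last-resort extraction through readability_plus on the raw tree."""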
        if tree is None:
            return None
        try:
            doc = DocumentPlus(
                deepcopy(tree),
                url=base_url,
                xp_num="others",
                need_comment=self.need_comment,
            )
            return doc.summary(html_partial=True)
        except Exception:
            return None

    def apply_fallbacks(self, primary_html, base_url, normal_tree, raw_tree):
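        """Return the first sufficiently long candidate among primary
        extraction, wild-text recovery, and readability, together with a label
        naming which source produced it."""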
        if self._is_content_sufficient(primary_html):
            return primary_html, "primary"
        wild_html = None
        if self.enable_wild_text_fallback:
            wild_html = self.recover_wild_text(normal_tree, base_url)
            if self._is_content_sufficient(wild_html):
                return wild_html, "wild_text"
        readability_html = None
        if self.enable_readability_fallback:
            readability_html = self.readability_fallback(raw_tree, base_url)
            if self._is_content_sufficient(readability_html):
                return readability_html, "readability"
        for candidate, name in (
            (primary_html, "primary"),
            (wild_html, "wild_text"),
            (readability_html, "readability"),
        ):
            if candidate:
                return candidate, name
        return "", "primary"

    def prune_unwanted_nodes(self, tree, nodelist, with_backup=False):
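        """Remove nodes matched by the XPath expressions in ``nodelist``,
        skipping math containers, captioned images, overflow-styled "hidden"
        nodes, and anything inside code/pre. With ``with_backup=True``, revert
        to a backup copy if pruning left less than 1/7 of the original text."""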
        if with_backup is True:
            old_len = len(tree.text_content())
            backup = deepcopy(tree)
        for expr in nodelist:
            for subtree in tree.xpath(expr):
                if self.preserve_math_containers and subtree.xpath(".//math"):
                    continue
                # DISCARD_IMAGE_ELEMENTS needs a special check
                if '"caption"' in expr and subtree.xpath(".//img"):
                    continue
                # Some nodes are "hidden" only via overflow styling; keep those
                if "hidden" in expr:
                    try:
                        if re.findall(
                            r"overflow-x:\s*hidden", subtree.attrib["style"]
                        ) or re.findall(
                            r"overflow-y:\s*hidden", subtree.attrib["style"]
                        ):
                            continue
                        if re.findall(
                            r"overflow:\s*hidden", subtree.attrib["style"]
                        ) and re.findall("height:", subtree.attrib["style"]):
                            height_px = re.findall(
                                r"height:\s*(\d+)", subtree.attrib["style"]
                            )[0]
                            if int(height_px) >= 800:
                                continue
                    except Exception:
                        pass
                if ancestor_node_check(subtree, ['code', 'pre']):
                    continue
                self.remove_node(subtree)
        if with_backup is False:
            return tree
        new_len = len(tree.text_content())
        if new_len > old_len / 7:
            return tree
        return backup

    def prune_html(self, tree):
        """Delete selected empty elements."""
        for element in tree.xpath(".//*[not(node())]"):
            if element.tag in CUT_EMPTY_ELEMS:
                self.remove_node(element)
        return tree

    def remove_node(self, node: HtmlElement):
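        """Detach ``node`` while preserving its tail text (merged into the
        parent's text or the previous sibling's tail) and record its Unique_ID
        in ``self.drop_ids``."""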
        parent = node.getparent()
        if text_strip(node.tail):
            previous = node.getprevious()
            if previous is None:
                if parent is not None:
                    if text_strip(parent.text):
                        parent.text = "".join([parent.text, node.tail])
                    else:
                        parent.text = node.tail
            else:
                if text_strip(previous.tail):
                    previous.tail = "".join([previous.tail, node.tail])
                else:
                    previous.tail = node.tail
        if parent is not None:
            idx = node.attrib.get(Unique_ID, "")
            parent.remove(node)
            if idx:
                self.drop_ids.append(int(idx))

    def clean_tags(self, tree):
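        """Strip comments and noise nodes, demote <figure> wrappers around
        tables to <div>, drop manually blacklisted tags (keeping math
        containers and text-heavy forms), then run lxml's HTML_CLEANER."""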
        strip_elements(tree, Comment)
        xp_lists = []
        if not self.need_comment:
            xp_lists.append(REMOVE_COMMENTS_XPATH)
        xp_lists.append(CONTENT_EXTRACTOR_NOISE_XPATHS)
        for xp_list in xp_lists:
            tree = self.prune_unwanted_nodes(tree, xp_list)
        cleaning_list, stripping_list = (
            MANUALLY_CLEANED.copy(),
            MANUALLY_STRIPPED.copy(),
        )
        for elem in tree.xpath(".//figure[descendant::table]"):
            elem.tag = "div"
        for expression in cleaning_list + ["form"]:
            for element in tree.iter(expression):
                if self.preserve_math_containers and element.xpath('.//math'):
                    continue
                # Special handling for <form> tags
                if element.tag == "form":
                    ptest = element.xpath(".//text()[not(ancestor::a)]")
                    if text_len("".join(ptest)) <= 60:  # 50
                        self.remove_node(element)
                else:
                    self.remove_node(element)
        HTML_CLEANER.kill_tags, HTML_CLEANER.remove_tags = cleaning_list, stripping_list
        cleaned_tree = HTML_CLEANER.clean_html(self.prune_html(tree))
        return cleaned_tree

    def generate_unique_id(self, element):
        idx = 0
        for node in iter_node(element):
            l_tag = node.tag.lower()
            if l_tag not in ["html", "body"]:
                node.attrib[Unique_ID] = str(idx)
                idx += 1

    def clean_unique_id(self, raw_element, content_html):
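        """Regroup dropped subtrees in the raw tree based on the Unique_ID
        attributes still present in ``content_html``, then return
        ``(content_html, drop_html)`` with all Unique_ID markers stripped."""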
        ids = re.findall(rf' {Unique_ID}="(\d+)"', content_html)
        self.drop_ids = list(set(self.drop_ids))
        self.drop_ids.sort()
        skip_ids = [-1]
        for x in ids:
            if int(x) > int(skip_ids[-1]):
                skip_ids.append(int(x))
                drop_node = raw_element.xpath(
                    f"//*[@{Unique_ID}='{x}']"
                )
                if drop_node:
                    new_div = Element("div")
                    for j in self.drop_ids:
                        if int(j) > int(skip_ids[-1]):
                            append_element = drop_node[0].xpath(
                                f".//*[@{Unique_ID}='{j}']"
                            )
                            if append_element:
                                skip_ids.append(j)
                                if len(append_element[0]) > 0:
                                    skip_ids.extend(
                                        [
                                            int(pjid)
                                            for pjid in append_element[0].xpath(
                                                f".//*/@{Unique_ID}"
                                            )
                                        ]
                                    )
                                append_element[0].tail = None
                                new_div.append(append_element[0])
                    try:
                        drop_node[0].addnext(new_div)
                        parent = drop_node[0].getparent()
                        if parent is not None:
                            parent.remove(drop_node[0])
                    except Exception:
                        pass
        content_html = re.sub(rf' {Unique_ID}="\d+"', "", content_html)
        drop_html = re.sub(
            rf' {Unique_ID}="\d+"',
            "",
            tostring(raw_element, encoding=str),
        )
        return content_html, drop_html

    def math_latex_processing(self, node):
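        """Normalize one node's math markup to inline LaTeX: unwrap
        align/equation environments in text and tails, recover formulas from
        LaTeX image renderers (codecogs, mimetex, latex.php, ...), and convert
        math/tex scripts, KaTeX/MathJax wrappers, and MathML <math> tags
        (via annotation, alttext, or mml_to_latex)."""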
        # 1. Text contains \begin{align} or \begin{equation}
        if node.tag not in ["script", "style"] and text_strip(node.text):
            regex = r"\\begin{align}(.*?)\\end{align}"
            text = node.text
            matches = re.findall(regex, text, re.DOTALL)
            if matches:
                node.text = text.replace("\\begin{align}", "").replace(
                    "\\end{align}", ""
                )
        if node.tag not in ["script", "style"] and text_strip(node.text):
            regex = r"\\begin{equation}(.*?)\\end{equation}"
            text = node.text
            matches = re.findall(regex, text, re.DOTALL)
            for match in matches:
                match = match.replace("\\begin{equation}", "")
                match = match.replace("\\end{equation}", "")
                wrapped_text = wrap_math(match, display=True)
                text = text.replace(match, wrapped_text)
            if matches:
                # Remove the \begin{equation} and \end{equation} tags
                text = text.replace("\\begin{equation}", "").replace(
                    "\\end{equation}", ""
                )
                node.text = text
        if node.tag not in ["script", "style"] and text_strip(node.tail):
            regex = r"\\begin{align}(.*?)\\end{align}"
            text = node.tail
            matches = re.findall(regex, text, re.DOTALL)
            if matches:
                node.tail = text.replace("\\begin{align}", "").replace(
                    "\\end{align}", ""
                )
        if node.tag not in ["script", "style"] and text_strip(node.tail):
            regex = r"\\begin{equation}(.*?)\\end{equation}"
            text = node.tail
            matches = re.findall(regex, text, re.DOTALL)
            for match in matches:
                match = match.replace("\\begin{equation}", "")
                match = match.replace("\\end{equation}", "")
                wrapped_text = wrap_math(match, display=True)
                text = text.replace(match, wrapped_text)
            if matches:
                # Remove the \begin{equation} and \end{equation} tags
                text = text.replace("\\begin{equation}", "").replace(
                    "\\end{equation}", ""
                )
                node.tail = text
        node_class = node.get("class")
        parent = node.getparent()
        # 2. Tags whose class is texerror
        # Find the text between {} (maximum length) and replace the texerror with that text
        # 3. LaTeX hidden in <img> tags
        if node.tag == "img":
            if node_class:
                class_list = node_class.split(" ")
                if any(
                    [img_class in class_list for img_class in latex_image_class_names]
                ):
                    alt = node.get("alt")
                    if text_strip(alt):
                        new_span = Element("span")
                        wrapped_alt = wrap_math(alt)
                        new_span.text = wrapped_alt
                        node.addprevious(new_span)
                        self.remove_node(node)
            src = node.get("src")
            if src:
                if "codecogs.com" in src:
                    try:
                        latex = src.split("?")[1:]
                        latex = "?".join(
                            latex
                        )  # In case there are multiple ? in the latex
                        latex = unquote(latex)
                        new_span = Element("span")
                        wrapped_latex = wrap_math(latex)
                        new_span.text = wrapped_latex
                        node.addprevious(new_span)
                        self.remove_node(node)
                    except Exception:
                        pass
                if "latex.php" in src:
                    try:
                        # they usually have "alt='-i u_t + \Delta u = |u|^2 u'"
                        alt = node.get("alt")
                        if text_strip(alt):
                            # Unescape the latex
                            alt = unquote(alt)
                            # Get the latex
                            wrapped_alt = wrap_math(alt)
                            new_span = Element("span")
                            new_span.text = wrapped_alt
                            node.addprevious(new_span)
                            self.remove_node(node)
                    except Exception:
                        pass
                if "/images/math/codecogs" in src:
                    try:
                        # they usually have "alt='-i u_t + \Delta u = |u|^2 u'"
                        alt = node.get("alt")
                        if text_strip(alt):
                            # Unescape the latex
                            alt = unquote(alt)
                            # Get the latex
                            wrapped_alt = wrap_math(alt)
                            new_span = Element("span")
                            new_span.text = wrapped_alt
                            node.addprevious(new_span)
                            self.remove_node(node)
                    except Exception:
                        pass
                if "mimetex.cgi" in src:
                    try:
                        latex = src.split("?")[1:]
                        latex = "?".join(
                            latex
                        )  # In case there are multiple ? in the latex
                        latex = unquote(latex)
                        new_span = Element("span")
                        wrapped_latex = wrap_math(latex)
                        new_span.text = wrapped_latex
                        node.addprevious(new_span)
                        self.remove_node(node)
                    except Exception:
                        pass
                if "mathtex.cgi" in src:
                    try:
                        latex = src.split("?")[1:]
                        latex = "?".join(
                            latex
                        )  # In case there are multiple ? in the latex
                        latex = unquote(latex)
                        new_span = Element("span")
                        wrapped_latex = wrap_math(latex)
                        new_span.text = wrapped_latex
                        node.addprevious(new_span)
                        self.remove_node(node)
                    except Exception:
                        pass
            if node_class:
                if "x-ck12" in node_class:
                    try:
                        latex = node.get("alt")
                        if text_strip(latex):
                            latex = unquote(latex)
                            new_span = Element("span")
                            wrapped_latex = wrap_math(latex)
                            new_span.text = wrapped_latex
                            node.addprevious(new_span)
                    except Exception:
                        pass
        # 4. class math-container
        if node_class == "math-container":
            try:
                text = node.text
                if text_strip(text):
                    new_span = Element("span")
                    wrapped_math = wrap_math(text, display=True)
                    new_span.text = wrapped_math
                    if parent is not None:
                        if text_strip(node.tail):
                            new_span.tail = node.tail
                        parent.replace(node, new_span)
            except Exception:
                pass
        # 5. class wp-katex-eq
        if node_class == "wp-katex-eq":
            try:
                text = node.text
                if text_strip(text):
                    new_span = Element("span")
                    display_attr = node.get("data-display")
                    if display_attr is not None:
                        display = display_attr == "true"
                    else:
                        display = False
                    wrapped_math = wrap_math(text, display=display)
                    new_span.text = wrapped_math
                    if parent is not None:
                        if text_strip(node.tail):
                            new_span.tail = node.tail
                        parent.replace(node, new_span)
            except Exception:
                pass
        # 6. script[type="math/tex"]
        if node.tag == "script" and node.get("type") == "math/tex":
            try:
                text = node.text
                if text_strip(text):
                    new_span = Element("span")
                    wrapped_text = wrap_math(text)
                    new_span.text = wrapped_text
                    if parent is not None:
                        if text_strip(node.tail):
                            new_span.tail = node.tail
                        parent.replace(node, new_span)
            except Exception:
                pass
        # 7. script[type="math/asciimath"]
        if node.tag == "script" and node.get("type") == "math/asciimath":
            try:
                text = node.text
                if text_strip(text):
                    new_span = Element("span")
                    wrapped_asciimath = wrap_math(extract_asciimath(text))
                    new_span.text = wrapped_asciimath
                    if parent is not None:
                        if text_strip(node.tail):
                            new_span.tail = node.tail
                        parent.replace(node, new_span)
            except Exception:
                # Delete this script tag
                self.remove_node(node)
        # 8. class tex
        if node_class == "tex":
            try:
                # Check if they have data-expr attr
                expr = node.get("data-expr")
                if text_strip(expr):
                    # Replace with a span
                    new_span = Element("span")
                    wrapped_expr = wrap_math(expr)
                    new_span.text = wrapped_expr
                    if parent is not None:
                        if text_strip(node.tail):
                            new_span.tail = node.tail
                        parent.replace(node, new_span)
            except Exception:
                pass
        # 9. span.katex
        if node.tag == "span" and node_class == "katex":
            # Find any spans with class "katex-html" and remove them
            katex_html_spans = node.xpath('.//span[@class="katex-html"]')
            for katex_html_span in katex_html_spans:
                self.remove_node(katex_html_span)
        # 10. Remove any .MathJax_Preview spans
        if node.tag == "span" and node_class == "MathJax_Preview":
            self.remove_node(node)
        if node.tag == "span" and node_class and "x-ck12-mathEditor" in node_class:
            try:
                expr = node.get("data-tex")
                if text_strip(expr):
                    expr = unquote(expr).replace('"', "").replace("&quot;", "")
                    # Replace with a span
                    new_span = Element("span")
                    wrapped_expr = wrap_math(expr)
                    new_span.text = wrapped_expr
                    if parent is not None:
                        if text_strip(node.tail):
                            new_span.tail = node.tail
                        parent.replace(node, new_span)
            except Exception:
                pass
        # 11. all math tags
        if node.tag == "math":
            annotation_tags = node.xpath('.//annotation[@encoding="application/x-tex"]')
            if len(annotation_tags) > 0:
                annotation_tag = annotation_tags[0]
                text = annotation_tag.text
                if text_strip(text):
                    new_span = Element("span")
                    wrapped_text = wrap_math(text)
                    new_span.text = wrapped_text
                    if parent is not None:
                        if text_strip(node.tail):
                            new_span.tail = node.tail
                        parent.replace(node, new_span)
                        style_value = parent.get("style")
                        if style_value:
                            normalized_style_value = (
                                style_value.lower()
                                .strip()
                                .replace(" ", "")
                                .replace(";", "")
                            )
                            if "display:none" in normalized_style_value:
                                parent.set("style", "")
            elif text_strip(node.get("alttext")):
                # Get the alttext attribute
                alttext = node.get("alttext")
                if text_strip(alttext):
                    new_span = Element("span")
                    wrapped_alttext = wrap_math(alttext)
                    new_span.text = wrapped_alttext
                    if parent is not None:
                        if text_strip(node.tail):
                            new_span.tail = node.tail
                        parent.replace(node, new_span)
            else:
                try:
                    # Try translating to LaTeX
                    tmp_node = deepcopy(node)
                    tmp_node.tail = None
                    mathml = tostring(tmp_node, encoding=str)
                    # If this includes xmlns:mml, then we need to replace all
                    # instances of mml: with nothing
                    if "xmlns:mml" in mathml:
                        mathml = mathml.replace("mml:", "")
                        # replace xmlns:mml="..." with nothing
                        mathml = re.sub(r'xmlns:mml=".*?"', "", mathml)
                    # if 'xmlns=' in mathml:
                    #     mathml = re.sub(r"xmlns='.*?'", '', mathml)
                    latex = mml_to_latex(mathml)
                    # Make a new span tag
                    new_span = Element("span")
                    # Set the html of the new span tag to the text
                    wrapped_latex = wrap_math(latex)
                    new_span.text = wrapped_latex
                    # Then, we need to replace the math tag with the new span tag
                    if parent is not None:
                        if text_strip(node.tail):
                            new_span.tail = node.tail
                        parent.replace(node, new_span)
                except Exception:
                    self.remove_node(node)
| if node.tag == "mathjax": | |
| try: | |
| # Get the inner text of the mathjax tag | |
| text = node.text | |
| if text_strip(text): | |
| text = html.unescape(text) | |
| # Use regex to find text wrapped in hashes | |
| matches = re.findall(r"#(.+?)#", text) | |
| # For each match, replace the match with the LaTeX | |
| for match in matches: | |
| try: | |
| latex = extract_asciimath(match) | |
| # Replace the match with the LaTeX | |
| text = text.replace(f"#{match}#", latex) | |
| except: | |
| pass | |
| # Create a new span tag | |
| new_span = Element("span") | |
| # Set the html of the new span tag to the text | |
| new_span.text = text | |
| # Then, we need to replace the mathjax tag with the new span tag | |
| if parent is not None: | |
| if text_strip(node.tail): | |
| new_span.tail = node.tail | |
| parent.replace(node, new_span) | |
| except: | |
| pass | |

    def convert_tags(self, element, base_url=""):
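        """Walk the tree: run math conversion, resolve data-src and relative
        src URLs against ``base_url``, promote childless <div> to <p>, and
        drop nodes whose class is in USELESS_ATTR."""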
        USELESS_ATTR_LIST = USELESS_ATTR
        if not self.need_comment:
            USELESS_ATTR_LIST = USELESS_ATTR_LIST + ["comment"]
        for node in iter_node(element):
            if self.process_math:
                # Also convert math markup to LaTeX
                self.math_latex_processing(node)
            if "data-src" in node.attrib and "src" not in node.attrib:
                node.attrib["src"] = node.attrib["data-src"]
            if "src" in node.attrib and node.attrib["src"] and base_url:
                src_url = node.attrib["src"]
                absolute_url = urljoin(base_url, src_url)
                node.attrib["src"] = absolute_url
            if node.tag.lower() == "div" and len(node) == 0:
                node.tag = "p"
            class_name = node.get("class")
            if class_name:
                if class_name.lower() in USELESS_ATTR_LIST:
                    self.remove_node(node)
        return element

    def delete_by_link_density(
        self, subtree, tagname, backtracking=False, favor_precision=False
    ):
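        """Delete runs of ``tagname`` siblings that look like link lists or
        cross-post teasers, judged by link density, mutual similarity, and
        "read more"-style suffixes. Returns ``(subtree, drop_list)``, where
        ``drop_list`` flags that list-like content was removed."""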
        need_del_par = []
        skip_par = []
        drop_list = False
        for descendant in subtree.iter(tagname):
            pparent = descendant.getparent()
            if pparent in need_del_par or pparent in skip_par:
                continue
            siblings = descendant.xpath(f"following-sibling::{tagname}")
            if 'list' in descendant.get("class", "") and len(descendant.xpath('./a')) >= 5:
                need_del_par.append(descendant)
                need_del_par.extend(siblings)
                continue
            nn = [descendant]
            nn.extend(siblings)
            txt_max_num = 0
            if len(siblings) + 1 >= 4:
                pass
            else:
                txt_max_dict = {
                    "read": 0,
                    "more": 0,
                    "...": 0,
                    "阅读": 0,
                    "更多": 0,
                    "详细": 0,
                    "detail": 0,
                    "article": 0,
                    "blog": 0,
                    "news": 0,
                }
                if tagname == "div" or tagname == "article" or tagname == "section":
                    for j in nn:
                        txt = "".join(j.xpath(".//text()")).strip()
                        for x in [
                            "read",
                            "more",
                            "...",
                            "阅读",
                            "更多",
                            "详细",
                            "detail",
                            "article",
                            "blog",
                            "news",
                        ]:
                            if txt.lower().endswith(x):
                                txt_max_dict[x] += 1
                        txt_num = max(txt_max_dict.values())
                        if txt_max_num < txt_num:
                            txt_max_num = txt_num
                        if txt_max_num >= 3:
                            break
                if txt_max_num >= 3:
                    pass
                else:
                    continue
            skip_par.append(pparent)
            a_num = 0
            for j in siblings:
                if j.xpath(".//a"):
                    if tagname == "p":
                        if density_of_a_text(j, pre=0.8):
                            a_num += 1
                    elif tagname in ["div", "section", "article"]:
                        if density_of_a_text(j, pre=0.2):
                            a_num += 1
                    else:
                        if self.need_comment:
                            # Check whether this node holds comment content before deleting
                            break_flg = False
                            for c_xpath in Forum_XPATH[:-1]:
                                if j.xpath(c_xpath.replace(".//*", "self::*")):
                                    break_flg = True
                                    break
                            if break_flg:
                                continue
                        if tagname == "li":
                            if text_len("".join(j.xpath(".//text()[not(ancestor::a)]"))) > 50:
                                continue
                        a_num += 1
            if a_num < len(siblings):
                if a_num >= 15 and (
                    tagname == "div" or tagname == "article" or tagname == "section"
                ):
                    pass
                else:
                    continue
            similarity_with_siblings_nums = similarity_with_siblings(
                descendant, siblings
            )
            if tagname == "article" or tagname == "item":  # or tagname == "section"
                similarity_with_siblings_nums = similarity_with_siblings_nums * 1.5
            # Lists have one special case: descendant and siblings may all contain title/h1 or h2 tags
            if tagname == "div" or tagname == "article" or tagname == "section":
                title_max_num = 0
                for ll in [".//head[@rend='h2']", ".//head[@rend='h1']", "./article"]:
                    title_num = 0
                    for jj in nn:
                        if jj.xpath(ll):
                            title_num += 1
                    if title_max_num < title_num:
                        title_max_num = title_num
                if title_max_num >= 4:
                    similarity_with_siblings_nums = similarity_with_siblings_nums * 1.5
            if txt_max_num >= 3:
                pass
            elif similarity_with_siblings_nums < 0.84:
                if len(siblings) >= 15 and (
                    tagname == "div" or tagname == "article" or tagname == "section"
                ):
                    pass
                else:
                    continue
            # When a parent div holds several sibling divs whose class matches
            # post-, delete the other nodes and keep the first article
            class_attr = descendant.get("class") if descendant.get("class") else ""
            if (
                re.findall("post-", class_attr, re.I)
                or re.findall("-post", class_attr, re.I)
                or re.findall("blog|article", class_attr, re.I)
            ):
                drop_list = True
                sk_flg = True
                for dl in siblings:
                    if (
                        text_len("".join(descendant.xpath(".//text()"))) * 2
                        < text_len("".join(dl.xpath(".//text()")))
                        and sk_flg
                    ):
                        self.remove_node(descendant)
                        sk_flg = False
                    else:
                        self.remove_node(dl)
            else:
                need_del_par.append(descendant)
                need_del_par.extend(siblings)
        for node in need_del_par:
            drop_list = True
            try:
                self.remove_node(node)
            except Exception:
                pass
        myelems, deletions = defaultdict(list), []
        if tagname == "div":
            for elem in subtree.iter(tagname):
                if density_of_a_text(elem, pre=0.8) and img_div_check(elem):
                    deletions.append(elem)
        for elem in subtree.iter(tagname):
            elemtext = trim(elem.text_content())
            result, templist = link_density_test(elem, elemtext, favor_precision)
            if result is True and img_div_check(elem):
                # Keep links that live inside tables
                if tagname in ['ul', 'li', 'div', 'p'] and ancestor_node_check(elem, ['td']):
                    continue
                deletions.append(elem)
            elif backtracking is True and len(templist) > 0:  # if?
                myelems[elemtext].append(elem)
        if backtracking is True:
            if favor_precision is False:
                threshold = 100
            else:
                threshold = 200
            for text, elem in myelems.items():
                if 0 < len(text) < threshold and len(elem) >= 3:
                    deletions.extend(elem)
        for elem in uniquify_list(deletions):
            try:
                if self.need_comment:
                    # Check whether the element holds comment content before deleting
                    break_flg = False
                    for c_xpath in Forum_XPATH[:-1]:
                        if elem.xpath(c_xpath):
                            break_flg = True
                            break
                    if break_flg:
                        continue
                self.remove_node(elem)
            except AttributeError:
                pass
        return subtree, drop_list

    def prune_unwanted_sections(self, tree):
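        """Prune discard/paywall/teaser/image sections, then apply
        delete_by_link_density over container, list, and paragraph tags.
        Returns the tree plus a flag noting whether list-like blocks were
        dropped."""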
        tmp_OVERALL_DISCARD_XPATH = OVERALL_DISCARD_XPATH
        if self.need_comment:
            tmp_OVERALL_DISCARD_XPATH = tmp_OVERALL_DISCARD_XPATH[:-1]
        tree = self.prune_unwanted_nodes(
            tree, tmp_OVERALL_DISCARD_XPATH, with_backup=True
        )
        for xp_list in [
            PAYWALL_DISCARD_XPATH,
            TEASER_DISCARD_XPATH,
            DISCARD_IMAGE_ELEMENTS,
        ]:
            tree = self.prune_unwanted_nodes(tree, xp_list)
        # remove elements by link density
        tree, drop_list_1 = self.delete_by_link_density(
            tree, "div", backtracking=True, favor_precision=False
        )
        tree, drop_list_1_1 = self.delete_by_link_density(
            tree, "article", backtracking=False, favor_precision=False
        )
        tree, drop_list_1_2 = self.delete_by_link_density(
            tree, "section", backtracking=False, favor_precision=False
        )
        tree, drop_list_2_1 = self.delete_by_link_density(
            tree, "ul", backtracking=False, favor_precision=False
        )
        tree, drop_list_2_2 = self.delete_by_link_density(
            tree, "li", backtracking=False, favor_precision=False
        )
        tree, drop_list_3_1 = self.delete_by_link_density(
            tree, "dl", backtracking=False, favor_precision=False
        )
        tree, drop_list_3_3 = self.delete_by_link_density(
            tree, "dt", backtracking=False, favor_precision=False
        )
        tree, drop_list_3_2 = self.delete_by_link_density(
            tree, "dd", backtracking=False, favor_precision=False
        )
        tree, drop_list_3 = self.delete_by_link_density(
            tree, "p", backtracking=False, favor_precision=False
        )
        return (
            tree,
            drop_list_1
            or drop_list_2_1
            or drop_list_2_2
            or drop_list_3
            or drop_list_1_1
            or drop_list_1_2
            or drop_list_3_1
            or drop_list_3_2
            or drop_list_3_3,
        )
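

# Minimal usage sketch (illustrative only, not part of the library; assumes
# the star imports above provide `fromstring` as used in this module, and
# that a raw HTML string `raw_html` is available; the exact pipeline order in
# the real caller may differ):
#
#     parser = BaseParser()
#     raw_tree = fromstring(raw_html)
#     parser.generate_unique_id(raw_tree)           # tag nodes for drop tracking
#     work_tree = deepcopy(raw_tree)
#     cleaned = parser.clean_tags(work_tree)        # strip noise, keep math
#     body, xp_num, _ = parser.xp_1_5(cleaned)      # XPath-based body extraction
#     primary = parser.get_content_html(body, xp_num=xp_num, base_url="")
#     content, source = parser.apply_fallbacks(primary, "", work_tree, raw_tree)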