| | import sys |
| | from time import sleep |
| | import trafilatura |
| | from trafilatura.meta import reset_caches |
| | from trafilatura.settings import DEFAULT_CONFIG |
| | import spacy |
| |
|
| |
|
# Length thresholds used by line_correction(): lines shorter than MIN_CHAR
# are skipped, and only the first MAX_CHAR characters of an over-long line
# are passed to the spaCy pipeline for sentence splitting.
MIN_CHAR = 50
MAX_CHAR = 5_000

# Cap on the size of documents handled by trafilatura.
# NOTE(review): DEFAULT_CONFIG looks like a configparser-style object in
# trafilatura; a plain attribute assignment may never be read by the
# library (the documented form is
# ``config.set("DEFAULT", "MAX_FILE_SIZE", "...")``). Left unchanged here
# because enforcing the limit would alter which pages are accepted -- confirm.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50_000

# spaCy pipeline, loaded once at import time; line_correction() uses it to
# split long lines into sentences.
nlp = spacy.load("en_core_web_lg")
| |
|
| |
|
def get_page(url):
    """Download *url* with trafilatura, retrying when the fetch fails.

    Up to three attempts are made, with a three second pause between
    consecutive attempts. A successful fetch is reported on stderr.

    Args:
        url: Address of the page to download.

    Returns:
        The value returned by ``trafilatura.fetch_url`` for the first
        attempt that produced a result, or ``None`` if every attempt failed.
    """
    attempts = 3
    for attempt in range(1, attempts + 1):
        try:
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
        except Exception:
            # Best effort: a failed download is retried rather than
            # propagated. Catching ``Exception`` instead of using a bare
            # ``except`` lets KeyboardInterrupt / SystemExit through.
            page = None

        # Explicit check instead of ``assert`` so the retry logic still
        # works when Python runs with -O (which strips assertions).
        if page is not None:
            print("Fetched " + url, file=sys.stderr)
            return page

        # Pause only between attempts; there is nothing to wait for after
        # the last one.
        if attempt < attempts:
            sleep(3)

    return None
| |
|
| |
|
def url2lines(url):
    """Fetch *url* and return the extracted text of the page as lines.

    Args:
        url: Address of the page to download.

    Returns:
        The result of ``html2lines`` for the downloaded page, or an empty
        list when ``get_page`` could not retrieve anything.
    """
    html = get_page(url)
    return [] if html is None else html2lines(html)
| |
|
| |
|
def line_correction(lines, max_size=100):
    """Drop very short lines and break over-long lines into sentence groups.

    Lines shorter than ``MIN_CHAR`` are discarded. Lines of at most
    ``max_size`` characters are kept unchanged. Longer lines are split into
    sentences with the module-level spaCy pipeline, and consecutive
    sentences are joined with single spaces until the chunk exceeds
    ``max_size``, at which point the chunk is emitted and a new one starts.

    Only the first ``MAX_CHAR`` characters of a long line are analysed;
    anything beyond that is not included in the output.

    Args:
        lines: Iterable of text lines.
        max_size: Length above which a line is split into sentence groups.

    Returns:
        A new list with the retained and re-chunked lines, in input order.
    """
    out_lines = []
    for line in lines:
        if len(line) < MIN_CHAR:
            continue

        if len(line) <= max_size:
            out_lines.append(line)
            continue

        # Truncate before parsing so a single huge line cannot dominate
        # the sentence-splitting cost.
        doc = nlp(line[:MAX_CHAR])

        chunk = ""
        for sent in doc.sents:
            sentence = str(sent).strip()
            chunk = f"{chunk} {sentence}" if chunk else sentence
            if len(chunk) > max_size:
                out_lines.append(chunk)
                chunk = ""

        # Keep the trailing remainder only if it satisfies the same
        # minimum-length rule applied to whole lines above (a length of
        # exactly MIN_CHAR is accepted in both places).
        if len(chunk) >= MIN_CHAR:
            out_lines.append(chunk)

    return out_lines
| |
|
| |
|
def html2lines(page):
    """Extract the main text of an HTML document and split it into lines.

    Args:
        page: HTML content of a page, or ``None``.

    Returns:
        The text returned by ``trafilatura.extract`` split on newline
        characters, or an empty list when *page* is ``None``, contains only
        whitespace, or the extraction yields no text.
    """
    # The None test must come first: calling .strip() on None would raise
    # AttributeError before the check could ever be reached.
    if page is None or not page.strip():
        return []

    text = trafilatura.extract(page, config=DEFAULT_CONFIG)
    # Clear trafilatura's internal caches after every extraction
    # (presumably to keep memory bounded over many pages -- confirm).
    reset_caches()

    if text is None:
        return []

    return text.split("\n")
| |
|