Spaces:
Paused
Paused
| import axios from 'axios'; | |
| import { htmlToText } from 'html-to-text'; | |
| import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; | |
| import { Document } from '@langchain/core/documents'; | |
| import pdfParse from 'pdf-parse'; | |
| import logger from './logger'; | |
/** Collapses every run of whitespace (including line breaks) into one space and trims the ends. */
const collapseWhitespace = (text: string): string =>
  text.replace(/\s+/g, ' ').trim();

/** Extracts a printable message from a caught value, which is `unknown` under strict mode. */
const toErrorMessage = (err: unknown): string =>
  err instanceof Error ? err.message : String(err);

/**
 * Returns the text of the first `<title>` element in `html`, or `undefined`
 * when there is none or it is blank.
 *
 * The match is case-insensitive, tolerates attributes on the tag and titles
 * that span several lines. HTML entities are NOT decoded.
 */
const extractHtmlTitle = (html: string): string | undefined => {
  const raw = /<title(?:\s[^>]*)?>([\s\S]*?)<\/title>/i.exec(html)?.[1];
  const title = raw === undefined ? '' : collapseWhitespace(raw);
  return title === '' ? undefined : title;
};

/** Wraps each text chunk in a `Document` carrying the source title and URL as metadata. */
const toLinkDocuments = (
  chunks: string[],
  title: string,
  url: string,
): Document[] =>
  chunks.map(
    (pageContent) => new Document({ pageContent, metadata: { title, url } }),
  );

/**
 * Downloads every link, converts the response (PDF or HTML) to plain text,
 * splits the text with `RecursiveCharacterTextSplitter` and returns the chunks
 * as `Document`s.
 *
 * - Links without an `http://` / `https://` scheme are prefixed with `https://`.
 * - A response whose `Content-Type` media type is `application/pdf` is parsed
 *   with `pdfParse` and titled `'PDF Document'`; anything else is treated as
 *   HTML and titled with its `<title>` (falling back to the URL).
 * - A link that fails to download or parse does not reject the call: the error
 *   is logged and a single placeholder `Document` describing the failure is
 *   emitted for that link.
 *
 * @param links - URLs (with or without scheme) to fetch.
 * @returns All chunk documents, grouped per link in the same order as `links`.
 */
export const getDocumentsFromLinks = async ({
  links,
}: {
  links: string[];
}): Promise<Document[]> => {
  const splitter = new RecursiveCharacterTextSplitter();

  const loadLink = async (link: string): Promise<Document[]> => {
    const trimmed = link.trim();
    const url = /^https?:\/\//i.test(trimmed) ? trimmed : `https://${trimmed}`;

    try {
      // Fetch raw bytes so the same response can be handed to the PDF parser
      // or decoded as text.
      const res = await axios.get(url, {
        responseType: 'arraybuffer',
      });

      // The header may carry parameters ("application/pdf; charset=binary") or
      // different casing, so compare only the media-type prefix.
      const contentType = String(res.headers['content-type'] ?? '')
        .trim()
        .toLowerCase();

      if (contentType.startsWith('application/pdf')) {
        const pdf = await pdfParse(res.data);
        const chunks = await splitter.splitText(collapseWhitespace(pdf.text));
        return toLinkDocuments(chunks, 'PDF Document', url);
      }

      // NOTE(review): assumes res.data is a Node Buffer (arraybuffer responses
      // under Node) so that toString('utf8') decodes it — confirm if this ever
      // runs in a browser build.
      const html: string = res.data.toString('utf8');
      const text = collapseWhitespace(
        htmlToText(html, {
          selectors: [
            {
              selector: 'a',
              options: {
                ignoreHref: true,
              },
            },
          ],
        }),
      );
      const chunks = await splitter.splitText(text);
      // `||` on purpose: a missing title falls back to the URL.
      return toLinkDocuments(chunks, extractHtmlTitle(html) || url, url);
    } catch (err: unknown) {
      const message = toErrorMessage(err);
      logger.error(`Error at generating documents from links: ${message}`);
      return [
        new Document({
          pageContent: `Failed to retrieve content from the link: ${message}`,
          metadata: {
            title: 'Failed to retrieve content',
            url,
          },
        }),
      ];
    }
  };

  // Each task returns its own array instead of pushing into shared state, so
  // the output order follows `links` rather than network completion order.
  const perLinkDocs = await Promise.all(links.map((link) => loadLink(link)));

  const docs: Document[] = [];
  for (const linkDocs of perLinkDocs) {
    // Push one by one: spreading a very large array into push() can exceed the
    // engine's argument-count limit.
    for (const doc of linkDocs) {
      docs.push(doc);
    }
  }
  return docs;
};