diff --git a/server/src/markdownify/get_llm_input_text.ts b/server/src/markdownify/get_llm_input_text.ts new file mode 100644 index 00000000..d3846373 --- /dev/null +++ b/server/src/markdownify/get_llm_input_text.ts @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: MIT + +import * as cheerio from 'cheerio'; +import { AnyAuthClient } from 'node_modules/google-auth-library/build/src'; +import { URL } from 'url'; + +export interface ProcessTextOptions { + htmlParser?: boolean; + keepImages?: boolean; + removeSvgImage?: boolean; + removeGifImage?: boolean; + removeImageTypes?: string[]; + keepWebpageLinks?: boolean; + removeScriptTag?: boolean; + removeStyleTag?: boolean; + removeTags?: string[]; +} + +export async function getProcessedText( + pageSource: string, + baseUrl: string, + options: ProcessTextOptions = {} +): Promise { + const { + keepImages = true, + removeSvgImage = true, + removeGifImage = true, + removeImageTypes = [], + keepWebpageLinks = true, + removeScriptTag = true, + removeStyleTag = true, + removeTags = [] + } = options; + + try { + const $ = cheerio.load(pageSource); + + // Remove tags + const tagsToRemove: string[] = []; + if (removeScriptTag) tagsToRemove.push('script'); + if (removeStyleTag) tagsToRemove.push('style'); + tagsToRemove.push(...removeTags); + + const uniqueTags = [...new Set(tagsToRemove)]; + uniqueTags.forEach(tag => { + $(tag).remove(); + }); + + // Process image links + const imageTypesToRemove: string[] = []; + if (removeSvgImage) imageTypesToRemove.push('.svg'); + if (removeGifImage) imageTypesToRemove.push('.gif'); + imageTypesToRemove.push(...removeImageTypes); + + const uniqueImageTypes = [...new Set(imageTypesToRemove)]; + + $('img').each((_: any, element: any) => { + try { + const $img = $(element); + if (!keepImages) { + $img.remove(); + } else { + const imageLink = $img.attr('src'); + let typeReplaced = false; + + if (imageLink) { + if (uniqueImageTypes.length > 0) { + for (const imageType of uniqueImageTypes) { + if (!typeReplaced && imageLink.includes(imageType)) { + $img.remove(); + typeReplaced = true; + break; + } + } + } + if (!typeReplaced) { + const absoluteUrl = new URL(imageLink, baseUrl).toString(); + $img.replaceWith('\n' + absoluteUrl + ' '); + } + } + } + } catch (error) { + console.error('Error while processing image link: ', error); + } + }); + + // Process website links + $('a[href]').each((_: any, element: any) => { + try { + const $link = $(element); + if (!keepWebpageLinks) { + $link.remove(); + } else { + const href = $link.attr('href'); + if (href) { + const absoluteUrl = new URL(href, baseUrl).toString(); + $link.replaceWith($link.text() + ': ' + absoluteUrl + ' '); + } + } + } catch (error) { + console.error('Error while processing webpage link: ', error); + } + }); + + // Get text content + let text: string; + const bodyContent = $('body'); + + if (bodyContent.length > 0) { + // For minification, we'll use a simple approach to clean up the HTML + const bodyHtml = bodyContent.html() || ''; + const minimizedBody = minifyHtml(bodyHtml); + text = htmlToText(minimizedBody); + } else { + text = $.text(); + } + + return text; + + } catch (error) { + console.error('Error while getting processed text: ', error); + return ''; + } +} + +// Simple HTML minification function +function minifyHtml(html: string): string { + return html + .replace(/\s+/g, ' ') + .replace(/>\s+<') + .trim(); +} + +// Convert HTML to text (simplified version of inscriptis functionality) +function htmlToText(html: string): string { + const $ = cheerio.load(html); + + // Remove elements that shouldn't contribute to text + $('script, style, noscript').remove(); + + // Get text content with basic formatting + let text = $('body').text() || $.text(); + + // Clean up the text + text = text + .replace(/\s+/g, ' ') + .replace(/\n\s*\n/g, '\n') + .trim(); + + return text; +} \ No newline at end of file