diff --git a/server/src/markdownify/get_html.ts b/server/src/markdownify/get_html.ts deleted file mode 100644 index dbf6a8a9..00000000 --- a/server/src/markdownify/get_html.ts +++ /dev/null @@ -1,55 +0,0 @@ -import { chromium, Browser, BrowserContext, Page } from 'playwright'; - -export interface GetPageSourceOptions { - wait?: number; - headless?: boolean; - userAgent?: string; -} - -export async function getPageSource( - url: string, - options: GetPageSourceOptions = {} -): Promise { - const { - wait = 1.5, - headless = true, - userAgent = "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 640 XL LTE) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Mobile Safari/537.36 Edge/12.10166" - } = options; - - let browser: Browser | null = null; - let context: BrowserContext | null = null; - let page: Page | null = null; - - try { - browser = await chromium.launch({ - headless, - args: ['--no-sandbox', '--disable-dev-shm-usage'] - }); - - context = await browser.newContext({ userAgent }); - page = await context.newPage(); - - // Convert wait time to milliseconds - const waitMs = wait * 1000; - - // Set default timeout and navigate to URL - await page.setDefaultTimeout(waitMs); - await page.goto(url, { waitUntil: 'domcontentloaded' }); - - // Wait for additional time if specified - if (waitMs > 0) { - await page.waitForTimeout(waitMs); - } - - const pageSource = await page.content(); - return pageSource; - - } catch (error) { - console.error('Error while getting page source: ', error); - return ''; // Explicitly return empty string on error - } finally { - if (page) await page.close(); - if (context) await context.close(); - if (browser) await browser.close(); - } -} \ No newline at end of file diff --git a/server/src/markdownify/get_llm_input_text.ts b/server/src/markdownify/get_llm_input_text.ts deleted file mode 100644 index 3e600140..00000000 --- a/server/src/markdownify/get_llm_input_text.ts +++ /dev/null @@ -1,530 +0,0 @@ -import * as cheerio from 'cheerio'; -import { URL } from 'url'; - -export interface ProcessTextOptions { - keepImages?: boolean; - removeSvgImage?: boolean; - removeGifImage?: boolean; - removeImageTypes?: string[]; - keepWebpageLinks?: boolean; - removeScriptTag?: boolean; - removeStyleTag?: boolean; - removeTags?: string[]; - formatAsMarkdown?: boolean; - maxContentLength?: number; - preserveLineBreaks?: boolean; - includeMetadata?: boolean; -} - -export interface ProcessedResult { - markdown: string; - plainText: string; - metadata: { - title: string; - description: string; - url: string; - processedAt: string; - textLength: number; - markdownLength: number; - hasContent: boolean; - language?: string; - wordCount: number; - linkCount: number; - imageCount: number; - }; -} - -// Global cheerio instance for helper functions -let $: cheerio.CheerioAPI; - -export async function getProcessedText( - pageSource: string, - baseUrl: string, - options: ProcessTextOptions = {} -): Promise { - const { - keepImages = true, - removeSvgImage = true, - removeGifImage = true, - removeImageTypes = [], - keepWebpageLinks = true, - removeScriptTag = true, - removeStyleTag = true, - removeTags = [], - formatAsMarkdown = true, - maxContentLength = 100000, - preserveLineBreaks = true, - includeMetadata = true - } = options; - - try { - // Initialize cheerio without problematic options - $ = cheerio.load(pageSource); - - // Remove unwanted tags completely - const tagsToRemove: string[] = []; - if (removeScriptTag) tagsToRemove.push('script'); - if (removeStyleTag) tagsToRemove.push('style'); - if (removeScriptTag) tagsToRemove.push('noscript'); - tagsToRemove.push(...removeTags); - - const uniqueTags = [...new Set(tagsToRemove)]; - uniqueTags.forEach(tag => { - $(tag).remove(); - }); - - // Remove common unwanted elements - $('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove(); - - // Extract metadata - const title = extractTitle(); - const description = extractDescription(); - const language = extractLanguage(); - - // Generate both formats - const markdown = formatAsMarkdown ? - convertToMarkdown(baseUrl, options) : - ''; - - const plainText = convertToPlainText(baseUrl, options); - - // Truncate if necessary - const finalMarkdown = markdown.substring(0, maxContentLength); - const finalPlainText = plainText.substring(0, maxContentLength); - - // Count elements - const linkCount = $('a[href]').length; - const imageCount = $('img').length; - const wordCount = countWords(finalPlainText); - - const result: ProcessedResult = { - markdown: finalMarkdown, - plainText: finalPlainText, - metadata: { - title, - description, - url: baseUrl, - processedAt: new Date().toISOString(), - textLength: finalPlainText.length, - markdownLength: finalMarkdown.length, - hasContent: finalPlainText.length > 0, - language, - wordCount, - linkCount, - imageCount - } - }; - - return result; - - } catch (error) { - console.error('Error while getting processed text: ', error); - return createEmptyResult(baseUrl); - } -} - -function extractTitle(): string { - return $('title').text()?.trim() || - $('meta[property="og:title"]').attr('content')?.trim() || - $('h1').first().text()?.trim() || - 'Untitled'; -} - -function extractDescription(): string { - return $('meta[name="description"]').attr('content')?.trim() || - $('meta[property="og:description"]').attr('content')?.trim() || - ''; -} - -function extractLanguage(): string { - return $('html').attr('lang') || 'en'; -} - -function countWords(text: string): number { - return text.split(/\s+/).filter(word => word.length > 0).length; -} - -function convertToMarkdown(baseUrl: string, options: ProcessTextOptions): string { - const { keepImages, keepWebpageLinks, preserveLineBreaks } = options; - - // Start with metadata if available - let markdown = ''; - const title = extractTitle(); - if (title && title !== 'Untitled') { - markdown += `# ${title}\n\n`; - } - - const description = extractDescription(); - if (description) { - markdown += `> ${description}\n\n`; - } - - // Clone the body to avoid modifying the original - const $body = $('body').clone(); - - // Remove unwanted elements from the clone - $body.find('script, style, noscript, meta, link').remove(); - $body.find('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove(); - - // Process in order of importance - const sections: string[] = []; - - // Process main content areas first - const contentSelectors = [ - 'main', 'article', '[role="main"]', '.content', '.main', - '#content', '#main', '.post', '.article' - ]; - - let mainContent = ''; - for (const selector of contentSelectors) { - const $content = $body.find(selector).first(); - if ($content.length > 0) { - mainContent = processElementToMarkdown($content, baseUrl, options, 0); - if (mainContent.trim().length > 100) { // Only use if substantial content - sections.push(mainContent); - $content.remove(); // Remove from body to avoid duplication - break; - } - } - } - - // Process headers and structure - sections.push(processElementToMarkdown($body, baseUrl, options, 0)); - - // Combine sections - markdown += sections.filter(s => s.trim().length > 0).join('\n\n'); - - // Final cleanup - markdown = cleanMarkdown(markdown, preserveLineBreaks); - - return markdown; -} - -function processElementToMarkdown($element: cheerio.Cheerio, baseUrl: string, options: ProcessTextOptions, depth: number = 0): string { - if (depth > 10) return ''; // Prevent infinite recursion - - const { keepImages, keepWebpageLinks } = options; - let markdown = ''; - - $element.contents().each((index, node) => { - if (node.type === 'text') { - const text = $(node).text().trim(); - if (text) { - markdown += text + ' '; - } - } else if (node.type === 'tag') { - const $node = $(node); - const tagName = node.name?.toLowerCase() || ''; - - switch (tagName) { - case 'h1': - markdown += `\n# ${$node.text().trim()}\n\n`; - break; - case 'h2': - markdown += `\n## ${$node.text().trim()}\n\n`; - break; - case 'h3': - markdown += `\n### ${$node.text().trim()}\n\n`; - break; - case 'h4': - markdown += `\n#### ${$node.text().trim()}\n\n`; - break; - case 'h5': - markdown += `\n##### ${$node.text().trim()}\n\n`; - break; - case 'h6': - markdown += `\n###### ${$node.text().trim()}\n\n`; - break; - case 'p': - const paragraphText = processElementToMarkdown($node, baseUrl, options, depth + 1); - if (paragraphText.trim()) { - markdown += `\n${paragraphText.trim()}\n\n`; - } - break; - case 'br': - markdown += '\n'; - break; - case 'hr': - markdown += '\n---\n\n'; - break; - case 'strong': - case 'b': - const strongText = processElementToMarkdown($node, baseUrl, options, depth + 1); - if (strongText.trim()) { - markdown += `**${strongText.trim()}**`; - } - break; - case 'em': - case 'i': - const emText = processElementToMarkdown($node, baseUrl, options, depth + 1); - if (emText.trim()) { - markdown += `*${emText.trim()}*`; - } - break; - case 'code': - if (!$node.closest('pre').length) { - const codeText = $node.text().trim(); - if (codeText) { - markdown += `\`${codeText}\``; - } - } - break; - case 'pre': - const preText = $node.text().trim(); - if (preText) { - const codeClass = $node.find('code').attr('class'); - const language = codeClass ? codeClass.replace('language-', '') : ''; - markdown += `\n\`\`\`${language}\n${preText}\n\`\`\`\n\n`; - } - break; - case 'blockquote': - const quoteText = processElementToMarkdown($node, baseUrl, options, depth + 1); - if (quoteText.trim()) { - const lines = quoteText.trim().split('\n'); - markdown += '\n' + lines.map(line => `> ${line}`).join('\n') + '\n\n'; - } - break; - case 'ul': - const listItems: string[] = []; - $node.find('> li').each((_, li) => { - const itemText = processElementToMarkdown($(li), baseUrl, options, depth + 1); - if (itemText.trim()) { - listItems.push(`- ${itemText.trim()}`); - } - }); - if (listItems.length > 0) { - markdown += '\n' + listItems.join('\n') + '\n\n'; - } - break; - case 'ol': - const olItems: string[] = []; - $node.find('> li').each((i, li) => { - const itemText = processElementToMarkdown($(li), baseUrl, options, depth + 1); - if (itemText.trim()) { - olItems.push(`${i + 1}. ${itemText.trim()}`); - } - }); - if (olItems.length > 0) { - markdown += '\n' + olItems.join('\n') + '\n\n'; - } - break; - case 'a': - if (keepWebpageLinks) { - const href = $node.attr('href'); - const linkText = processElementToMarkdown($node, baseUrl, options, depth + 1).trim(); - if (href && linkText) { - try { - const absoluteUrl = new URL(href, baseUrl).toString(); - markdown += `[${linkText}](${absoluteUrl})`; - } catch { - markdown += linkText; - } - } else if (linkText) { - markdown += linkText; - } - } else { - markdown += processElementToMarkdown($node, baseUrl, options, depth + 1); - } - break; - case 'img': - if (keepImages) { - const src = $node.attr('src'); - const alt = $node.attr('alt') || $node.attr('title') || ''; - if (src && !shouldRemoveImage(src, options)) { - try { - const absoluteUrl = new URL(src, baseUrl).toString(); - markdown += `![${alt}](${absoluteUrl})`; - } catch { - // Ignore invalid URLs - } - } - } - break; - case 'table': - markdown += processTableToMarkdown($node); - break; - case 'div': - case 'section': - case 'article': - case 'header': - case 'footer': - case 'nav': - case 'aside': - // Process block-level elements with their content - const blockContent = processElementToMarkdown($node, baseUrl, options, depth + 1); - if (blockContent.trim()) { - markdown += `\n${blockContent.trim()}\n\n`; - } - break; - default: - // For other tags, just process their content - markdown += processElementToMarkdown($node, baseUrl, options, depth + 1); - break; - } - } - }); - - return markdown; -} - -function processTableToMarkdown($table: cheerio.Cheerio): string { - const rows: string[][] = []; - let maxColumns = 0; - - $table.find('tr').each((_, row) => { - const $row = $(row); - const cells: string[] = []; - - $row.find('th, td').each((_, cell) => { - const $cell = $(cell); - const text = $cell.text().trim(); - const colspan = parseInt($cell.attr('colspan') || '1'); - - cells.push(text); - // Add empty cells for colspan - for (let i = 1; i < colspan; i++) { - cells.push(''); - } - }); - - if (cells.length > 0) { - rows.push(cells); - maxColumns = Math.max(maxColumns, cells.length); - } - }); - - if (rows.length === 0) return ''; - - let markdownTable = '\n'; - - // Header row - if (rows.length > 0) { - markdownTable += `| ${rows[0].join(' | ')} |\n`; - markdownTable += `|${' --- |'.repeat(rows[0].length)}\n`; - - // Data rows - for (let i = 1; i < rows.length; i++) { - markdownTable += `| ${rows[i].join(' | ')} |\n`; - } - } - - return markdownTable + '\n'; -} - -function convertToPlainText(baseUrl: string, options: ProcessTextOptions): string { - const { keepImages, keepWebpageLinks } = options; - - const $body = $('body').clone(); - - // Remove unwanted elements - $body.find('script, style, noscript, meta, link').remove(); - $body.find('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove(); - - // Process images - if (keepImages) { - $body.find('img').each((_, element) => { - const $img = $(element); - const src = $img.attr('src'); - const alt = $img.attr('alt') || ''; - - if (src && !shouldRemoveImage(src, options)) { - try { - const absoluteUrl = new URL(src, baseUrl).toString(); - $img.replaceWith(`[Image: ${alt || 'image'} - ${absoluteUrl}]`); - } catch { - $img.remove(); - } - } else { - $img.remove(); - } - }); - } else { - $body.find('img').remove(); - } - - // Process links - if (keepWebpageLinks) { - $body.find('a[href]').each((_, element) => { - const $link = $(element); - const href = $link.attr('href'); - const text = $link.text().trim(); - - if (href && text) { - try { - const absoluteUrl = new URL(href, baseUrl).toString(); - $link.replaceWith(`${text} (${absoluteUrl})`); - } catch { - $link.replaceWith(text); - } - } - }); - } else { - $body.find('a[href]').each((_, element) => { - const $link = $(element); - $link.replaceWith($link.text().trim()); - }); - } - - let text = $body.text(); - text = cleanText(text); - - return text; -} - -function shouldRemoveImage(src: string, options: ProcessTextOptions): boolean { - const { removeSvgImage, removeGifImage, removeImageTypes = [] } = options; - - const imageTypesToRemove: string[] = []; - if (removeSvgImage) imageTypesToRemove.push('.svg'); - if (removeGifImage) imageTypesToRemove.push('.gif'); - imageTypesToRemove.push(...removeImageTypes); - - return imageTypesToRemove.some(type => src.toLowerCase().includes(type.toLowerCase())); -} - -function cleanMarkdown(markdown: string, preserveLineBreaks: boolean = true): string { - return markdown - // Normalize line breaks - .replace(/\r\n/g, '\n') - // Remove excessive empty lines (keep max 2) - .replace(/\n{3,}/g, '\n\n') - // Clean up spaces around headers - .replace(/\n\s*(#+)\s*/g, '\n$1 ') - // Remove spaces at start of lines - .replace(/^\s+/gm, '') - // Remove trailing whitespace - .replace(/[ \t]+$/gm, '') - // Fix multiple spaces - .replace(/[ ]{2,}/g, ' ') - // Ensure proper spacing after paragraphs - .replace(/([^\n])\n([^\n])/g, '$1\n\n$2') - .trim(); -} - -function cleanText(text: string): string { - return text - .replace(/\r\n/g, '\n') - .replace(/\s+/g, ' ') - .replace(/\n\s*\n/g, '\n\n') - .replace(/[ ]{2,}/g, ' ') - .trim(); -} - -function createEmptyResult(url: string): ProcessedResult { - return { - markdown: '', - plainText: '', - metadata: { - title: '', - description: '', - url: url, - processedAt: new Date().toISOString(), - textLength: 0, - markdownLength: 0, - hasContent: false, - wordCount: 0, - linkCount: 0, - imageCount: 0 - } - }; -} \ No newline at end of file diff --git a/server/src/markdownify/get_llm_ready_text.ts b/server/src/markdownify/get_llm_ready_text.ts deleted file mode 100644 index 025fb52d..00000000 --- a/server/src/markdownify/get_llm_ready_text.ts +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: MIT - -import { getPageSource, GetPageSourceOptions } from './get_html'; -import { getProcessedText, ProcessTextOptions, ProcessedResult } from './get_llm_input_text'; - -export interface UrlToLlmTextOptions extends GetPageSourceOptions, ProcessTextOptions {} - -export async function urlToLlmText( - url: string, - options: UrlToLlmTextOptions = {} -): Promise { - try { - const pageSource = await getPageSource(url, options); - - if (!pageSource) { - return createEmptyResult(url); - } - - const result = await getProcessedText(pageSource, url, options); - return result; - - } catch (error) { - console.error('Error while scraping url: ', error); - return createEmptyResult(url); - } -} - -function createEmptyResult(url: string): ProcessedResult { - return { - markdown: '', - plainText: '', - metadata: { - title: '', - description: '', - url: url, - processedAt: new Date().toISOString(), - textLength: 0, - markdownLength: 0, - hasContent: false, - language: 'en', - wordCount: 0, - linkCount: 0, - imageCount: 0 - } - }; -} - -export { getPageSource, getProcessedText }; \ No newline at end of file