From 28f1bf85102acca3daa65738bc3dcc2af1306b89 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 17 Nov 2025 21:52:39 +0530 Subject: [PATCH] fix: better markdown output --- server/src/markdownify/get_llm_input_text.ts | 553 +++++++++++++------ 1 file changed, 378 insertions(+), 175 deletions(-) diff --git a/server/src/markdownify/get_llm_input_text.ts b/server/src/markdownify/get_llm_input_text.ts index fa0aec6c..3e600140 100644 --- a/server/src/markdownify/get_llm_input_text.ts +++ b/server/src/markdownify/get_llm_input_text.ts @@ -11,6 +11,9 @@ export interface ProcessTextOptions { removeStyleTag?: boolean; removeTags?: string[]; formatAsMarkdown?: boolean; + maxContentLength?: number; + preserveLineBreaks?: boolean; + includeMetadata?: boolean; } export interface ProcessedResult { @@ -18,13 +21,22 @@ export interface ProcessedResult { plainText: string; metadata: { title: string; + description: string; url: string; processedAt: string; textLength: number; markdownLength: number; + hasContent: boolean; + language?: string; + wordCount: number; + linkCount: number; + imageCount: number; }; } +// Global cheerio instance for helper functions +let $: cheerio.CheerioAPI; + export async function getProcessedText( pageSource: string, baseUrl: string, @@ -39,16 +51,21 @@ export async function getProcessedText( removeScriptTag = true, removeStyleTag = true, removeTags = [], - formatAsMarkdown = true + formatAsMarkdown = true, + maxContentLength = 100000, + preserveLineBreaks = true, + includeMetadata = true } = options; try { - const $ = cheerio.load(pageSource); + // Initialize cheerio without problematic options + $ = cheerio.load(pageSource); - // Remove unwanted tags + // Remove unwanted tags completely const tagsToRemove: string[] = []; if (removeScriptTag) tagsToRemove.push('script'); if (removeStyleTag) tagsToRemove.push('style'); + if (removeScriptTag) tagsToRemove.push('noscript'); tagsToRemove.push(...removeTags); const uniqueTags = [...new Set(tagsToRemove)]; @@ -56,25 +73,45 @@ export async function getProcessedText( $(tag).remove(); }); - // Extract page title - const title = $('title').text() || $('h1').first().text() || 'Untitled'; + // Remove common unwanted elements + $('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove(); + // Extract metadata + const title = extractTitle(); + const description = extractDescription(); + const language = extractLanguage(); + // Generate both formats const markdown = formatAsMarkdown ? - convertToMarkdown($, baseUrl, options) : - convertToPlainText($, baseUrl, options); // Fallback to plain text if markdown disabled + convertToMarkdown(baseUrl, options) : + ''; - const plainText = convertToPlainText($, baseUrl, options); + const plainText = convertToPlainText(baseUrl, options); + + // Truncate if necessary + const finalMarkdown = markdown.substring(0, maxContentLength); + const finalPlainText = plainText.substring(0, maxContentLength); + + // Count elements + const linkCount = $('a[href]').length; + const imageCount = $('img').length; + const wordCount = countWords(finalPlainText); const result: ProcessedResult = { - markdown, - plainText, + markdown: finalMarkdown, + plainText: finalPlainText, metadata: { - title: title.trim(), + title, + description, url: baseUrl, processedAt: new Date().toISOString(), - textLength: plainText.length, - markdownLength: markdown.length + textLength: finalPlainText.length, + markdownLength: finalMarkdown.length, + hasContent: finalPlainText.length > 0, + language, + wordCount, + linkCount, + imageCount } }; @@ -82,95 +119,307 @@ export async function getProcessedText( } catch (error) { console.error('Error while getting processed text: ', error); - // Return empty result on error - return { - markdown: '', - plainText: '', - metadata: { - title: '', - url: baseUrl, - processedAt: new Date().toISOString(), - textLength: 0, - markdownLength: 0 - } - }; + return createEmptyResult(baseUrl); } } -function convertToMarkdown($: cheerio.CheerioAPI, baseUrl: string, options: ProcessTextOptions): string { - const { keepImages, keepWebpageLinks } = options; +function extractTitle(): string { + return $('title').text()?.trim() || + $('meta[property="og:title"]').attr('content')?.trim() || + $('h1').first().text()?.trim() || + 'Untitled'; +} + +function extractDescription(): string { + return $('meta[name="description"]').attr('content')?.trim() || + $('meta[property="og:description"]').attr('content')?.trim() || + ''; +} + +function extractLanguage(): string { + return $('html').attr('lang') || 'en'; +} + +function countWords(text: string): number { + return text.split(/\s+/).filter(word => word.length > 0).length; +} + +function convertToMarkdown(baseUrl: string, options: ProcessTextOptions): string { + const { keepImages, keepWebpageLinks, preserveLineBreaks } = options; + // Start with metadata if available + let markdown = ''; + const title = extractTitle(); + if (title && title !== 'Untitled') { + markdown += `# ${title}\n\n`; + } + + const description = extractDescription(); + if (description) { + markdown += `> ${description}\n\n`; + } + // Clone the body to avoid modifying the original const $body = $('body').clone(); - // Process headers - $body.find('h1').each((_, element) => { - const $el = $(element); - $el.replaceWith(`# ${$el.text().trim()}\n\n`); - }); - - $body.find('h2').each((_, element) => { - const $el = $(element); - $el.replaceWith(`## ${$el.text().trim()}\n\n`); - }); - - $body.find('h3').each((_, element) => { - const $el = $(element); - $el.replaceWith(`### ${$el.text().trim()}\n\n`); - }); - - $body.find('h4, h5, h6').each((_, element) => { - const $el = $(element); - const level = element.name?.substring(1) || '4'; - const hashes = '#'.repeat(parseInt(level)); - $el.replaceWith(`${hashes} ${$el.text().trim()}\n\n`); - }); + // Remove unwanted elements from the clone + $body.find('script, style, noscript, meta, link').remove(); + $body.find('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove(); - // Process paragraphs - $body.find('p').each((_, element) => { - const $el = $(element); - $el.replaceWith(`${$el.text().trim()}\n\n`); - }); + // Process in order of importance + const sections: string[] = []; - // Process lists - $body.find('li').each((_, element) => { - const $el = $(element); - const text = $el.text().trim(); - if ($el.parent().is('ol')) { - $el.replaceWith(`1. ${text}\n`); - } else { - $el.replaceWith(`- ${text}\n`); + // Process main content areas first + const contentSelectors = [ + 'main', 'article', '[role="main"]', '.content', '.main', + '#content', '#main', '.post', '.article' + ]; + + let mainContent = ''; + for (const selector of contentSelectors) { + const $content = $body.find(selector).first(); + if ($content.length > 0) { + mainContent = processElementToMarkdown($content, baseUrl, options, 0); + if (mainContent.trim().length > 100) { // Only use if substantial content + sections.push(mainContent); + $content.remove(); // Remove from body to avoid duplication + break; + } + } + } + + // Process headers and structure + sections.push(processElementToMarkdown($body, baseUrl, options, 0)); + + // Combine sections + markdown += sections.filter(s => s.trim().length > 0).join('\n\n'); + + // Final cleanup + markdown = cleanMarkdown(markdown, preserveLineBreaks); + + return markdown; +} + +function processElementToMarkdown($element: cheerio.Cheerio, baseUrl: string, options: ProcessTextOptions, depth: number = 0): string { + if (depth > 10) return ''; // Prevent infinite recursion + + const { keepImages, keepWebpageLinks } = options; + let markdown = ''; + + $element.contents().each((index, node) => { + if (node.type === 'text') { + const text = $(node).text().trim(); + if (text) { + markdown += text + ' '; + } + } else if (node.type === 'tag') { + const $node = $(node); + const tagName = node.name?.toLowerCase() || ''; + + switch (tagName) { + case 'h1': + markdown += `\n# ${$node.text().trim()}\n\n`; + break; + case 'h2': + markdown += `\n## ${$node.text().trim()}\n\n`; + break; + case 'h3': + markdown += `\n### ${$node.text().trim()}\n\n`; + break; + case 'h4': + markdown += `\n#### ${$node.text().trim()}\n\n`; + break; + case 'h5': + markdown += `\n##### ${$node.text().trim()}\n\n`; + break; + case 'h6': + markdown += `\n###### ${$node.text().trim()}\n\n`; + break; + case 'p': + const paragraphText = processElementToMarkdown($node, baseUrl, options, depth + 1); + if (paragraphText.trim()) { + markdown += `\n${paragraphText.trim()}\n\n`; + } + break; + case 'br': + markdown += '\n'; + break; + case 'hr': + markdown += '\n---\n\n'; + break; + case 'strong': + case 'b': + const strongText = processElementToMarkdown($node, baseUrl, options, depth + 1); + if (strongText.trim()) { + markdown += `**${strongText.trim()}**`; + } + break; + case 'em': + case 'i': + const emText = processElementToMarkdown($node, baseUrl, options, depth + 1); + if (emText.trim()) { + markdown += `*${emText.trim()}*`; + } + break; + case 'code': + if (!$node.closest('pre').length) { + const codeText = $node.text().trim(); + if (codeText) { + markdown += `\`${codeText}\``; + } + } + break; + case 'pre': + const preText = $node.text().trim(); + if (preText) { + const codeClass = $node.find('code').attr('class'); + const language = codeClass ? codeClass.replace('language-', '') : ''; + markdown += `\n\`\`\`${language}\n${preText}\n\`\`\`\n\n`; + } + break; + case 'blockquote': + const quoteText = processElementToMarkdown($node, baseUrl, options, depth + 1); + if (quoteText.trim()) { + const lines = quoteText.trim().split('\n'); + markdown += '\n' + lines.map(line => `> ${line}`).join('\n') + '\n\n'; + } + break; + case 'ul': + const listItems: string[] = []; + $node.find('> li').each((_, li) => { + const itemText = processElementToMarkdown($(li), baseUrl, options, depth + 1); + if (itemText.trim()) { + listItems.push(`- ${itemText.trim()}`); + } + }); + if (listItems.length > 0) { + markdown += '\n' + listItems.join('\n') + '\n\n'; + } + break; + case 'ol': + const olItems: string[] = []; + $node.find('> li').each((i, li) => { + const itemText = processElementToMarkdown($(li), baseUrl, options, depth + 1); + if (itemText.trim()) { + olItems.push(`${i + 1}. ${itemText.trim()}`); + } + }); + if (olItems.length > 0) { + markdown += '\n' + olItems.join('\n') + '\n\n'; + } + break; + case 'a': + if (keepWebpageLinks) { + const href = $node.attr('href'); + const linkText = processElementToMarkdown($node, baseUrl, options, depth + 1).trim(); + if (href && linkText) { + try { + const absoluteUrl = new URL(href, baseUrl).toString(); + markdown += `[${linkText}](${absoluteUrl})`; + } catch { + markdown += linkText; + } + } else if (linkText) { + markdown += linkText; + } + } else { + markdown += processElementToMarkdown($node, baseUrl, options, depth + 1); + } + break; + case 'img': + if (keepImages) { + const src = $node.attr('src'); + const alt = $node.attr('alt') || $node.attr('title') || ''; + if (src && !shouldRemoveImage(src, options)) { + try { + const absoluteUrl = new URL(src, baseUrl).toString(); + markdown += `![${alt}](${absoluteUrl})`; + } catch { + // Ignore invalid URLs + } + } + } + break; + case 'table': + markdown += processTableToMarkdown($node); + break; + case 'div': + case 'section': + case 'article': + case 'header': + case 'footer': + case 'nav': + case 'aside': + // Process block-level elements with their content + const blockContent = processElementToMarkdown($node, baseUrl, options, depth + 1); + if (blockContent.trim()) { + markdown += `\n${blockContent.trim()}\n\n`; + } + break; + default: + // For other tags, just process their content + markdown += processElementToMarkdown($node, baseUrl, options, depth + 1); + break; + } } }); - $body.find('ul, ol').each((_, element) => { - const $el = $(element); - $el.replaceWith(`\n${$el.html()}\n\n`); - }); + return markdown; +} - // Process blockquotes - $body.find('blockquote').each((_, element) => { - const $el = $(element); - const text = $el.text().trim(); - $el.replaceWith(`> ${text.replace(/\n/g, '\n> ')}\n\n`); - }); +function processTableToMarkdown($table: cheerio.Cheerio): string { + const rows: string[][] = []; + let maxColumns = 0; - // Process code blocks - $body.find('pre').each((_, element) => { - const $el = $(element); - const text = $el.text().trim(); - $el.replaceWith(`\`\`\`\n${text}\n\`\`\`\n\n`); - }); - - $body.find('code').each((_, element) => { - const $el = $(element); - // Only format inline code that's not inside pre blocks - if (!$el.closest('pre').length) { - const text = $el.text().trim(); - $el.replaceWith(`\`${text}\``); + $table.find('tr').each((_, row) => { + const $row = $(row); + const cells: string[] = []; + + $row.find('th, td').each((_, cell) => { + const $cell = $(cell); + const text = $cell.text().trim(); + const colspan = parseInt($cell.attr('colspan') || '1'); + + cells.push(text); + // Add empty cells for colspan + for (let i = 1; i < colspan; i++) { + cells.push(''); + } + }); + + if (cells.length > 0) { + rows.push(cells); + maxColumns = Math.max(maxColumns, cells.length); } }); + if (rows.length === 0) return ''; + + let markdownTable = '\n'; + + // Header row + if (rows.length > 0) { + markdownTable += `| ${rows[0].join(' | ')} |\n`; + markdownTable += `|${' --- |'.repeat(rows[0].length)}\n`; + + // Data rows + for (let i = 1; i < rows.length; i++) { + markdownTable += `| ${rows[i].join(' | ')} |\n`; + } + } + + return markdownTable + '\n'; +} + +function convertToPlainText(baseUrl: string, options: ProcessTextOptions): string { + const { keepImages, keepWebpageLinks } = options; + + const $body = $('body').clone(); + + // Remove unwanted elements + $body.find('script, style, noscript, meta, link').remove(); + $body.find('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove(); + // Process images if (keepImages) { $body.find('img').each((_, element) => { @@ -179,89 +428,12 @@ function convertToMarkdown($: cheerio.CheerioAPI, baseUrl: string, options: Proc const alt = $img.attr('alt') || ''; if (src && !shouldRemoveImage(src, options)) { - const absoluteUrl = new URL(src, baseUrl).toString(); - $img.replaceWith(`![${alt}](${absoluteUrl})\n\n`); - } else { - $img.remove(); - } - }); - } else { - $body.find('img').remove(); - } - - // Process links - if (keepWebpageLinks) { - $body.find('a[href]').each((_, element) => { - const $link = $(element); - const href = $link.attr('href'); - const text = $link.text().trim(); - - if (href && text) { - const absoluteUrl = new URL(href, baseUrl).toString(); - $link.replaceWith(`[${text}](${absoluteUrl})`); - } else if (text) { - $link.replaceWith(text); - } else { - $link.remove(); - } - }); - } else { - $body.find('a[href]').each((_, element) => { - const $link = $(element); - $link.replaceWith($link.text().trim()); - }); - } - - // Process tables (basic support) - $body.find('table').each((_, element) => { - const $table = $(element); - let markdownTable = '\n'; - - $table.find('tr').each((rowIndex, row) => { - const $row = $(row); - const cells: string[] = []; - - $row.find('th, td').each((_, cell) => { - const $cell = $(cell); - cells.push($cell.text().trim()); - }); - - if (cells.length > 0) { - markdownTable += `| ${cells.join(' | ')} |\n`; - - // Add header separator after first row - if (rowIndex === 0) { - markdownTable += `|${cells.map(() => '---').join('|')}|\n`; + try { + const absoluteUrl = new URL(src, baseUrl).toString(); + $img.replaceWith(`[Image: ${alt || 'image'} - ${absoluteUrl}]`); + } catch { + $img.remove(); } - } - }); - - $table.replaceWith(markdownTable + '\n'); - }); - - // Get the final text and clean it up - let markdown = $body.text(); - - // Clean up excessive whitespace while preserving structure - markdown = cleanMarkdown(markdown); - - return markdown; -} - -function convertToPlainText($: cheerio.CheerioAPI, baseUrl: string, options: ProcessTextOptions): string { - const { keepImages, keepWebpageLinks } = options; - - const $body = $('body').clone(); - - // Process images - if (keepImages) { - $body.find('img').each((_, element) => { - const $img = $(element); - const src = $img.attr('src'); - - if (src && !shouldRemoveImage(src, options)) { - const absoluteUrl = new URL(src, baseUrl).toString(); - $img.replaceWith(`\nImage: ${absoluteUrl}\n`); } else { $img.remove(); } @@ -278,8 +450,12 @@ function convertToPlainText($: cheerio.CheerioAPI, baseUrl: string, options: Pro const text = $link.text().trim(); if (href && text) { - const absoluteUrl = new URL(href, baseUrl).toString(); - $link.replaceWith(`${text}: ${absoluteUrl} `); + try { + const absoluteUrl = new URL(href, baseUrl).toString(); + $link.replaceWith(`${text} (${absoluteUrl})`); + } catch { + $link.replaceWith(text); + } } }); } else { @@ -303,25 +479,52 @@ function shouldRemoveImage(src: string, options: ProcessTextOptions): boolean { if (removeGifImage) imageTypesToRemove.push('.gif'); imageTypesToRemove.push(...removeImageTypes); - return imageTypesToRemove.some(type => src.includes(type)); + return imageTypesToRemove.some(type => src.toLowerCase().includes(type.toLowerCase())); } -function cleanMarkdown(markdown: string): string { +function cleanMarkdown(markdown: string, preserveLineBreaks: boolean = true): string { return markdown - // Replace 3+ newlines with 2 newlines + // Normalize line breaks + .replace(/\r\n/g, '\n') + // Remove excessive empty lines (keep max 2) .replace(/\n{3,}/g, '\n\n') - // Remove excessive spaces - .replace(/[ ]{2,}/g, ' ') - // Clean up space around headers + // Clean up spaces around headers .replace(/\n\s*(#+)\s*/g, '\n$1 ') + // Remove spaces at start of lines + .replace(/^\s+/gm, '') // Remove trailing whitespace .replace(/[ \t]+$/gm, '') + // Fix multiple spaces + .replace(/[ ]{2,}/g, ' ') + // Ensure proper spacing after paragraphs + .replace(/([^\n])\n([^\n])/g, '$1\n\n$2') .trim(); } function cleanText(text: string): string { return text + .replace(/\r\n/g, '\n') .replace(/\s+/g, ' ') .replace(/\n\s*\n/g, '\n\n') + .replace(/[ ]{2,}/g, ' ') .trim(); +} + +function createEmptyResult(url: string): ProcessedResult { + return { + markdown: '', + plainText: '', + metadata: { + title: '', + description: '', + url: url, + processedAt: new Date().toISOString(), + textLength: 0, + markdownLength: 0, + hasContent: false, + wordCount: 0, + linkCount: 0, + imageCount: 0 + } + }; } \ No newline at end of file