fix: better markdown output

This commit is contained in:
amhsirak
2025-11-17 21:52:39 +05:30
parent dae4e83412
commit 28f1bf8510

View File

@@ -11,6 +11,9 @@ export interface ProcessTextOptions {
removeStyleTag?: boolean;
removeTags?: string[];
formatAsMarkdown?: boolean;
maxContentLength?: number;
preserveLineBreaks?: boolean;
includeMetadata?: boolean;
}
export interface ProcessedResult {
@@ -18,13 +21,22 @@ export interface ProcessedResult {
plainText: string;
metadata: {
title: string;
description: string;
url: string;
processedAt: string;
textLength: number;
markdownLength: number;
hasContent: boolean;
language?: string;
wordCount: number;
linkCount: number;
imageCount: number;
};
}
// Module-level cheerio document handle shared by the helper functions in this file.
// getProcessedText() reassigns it with cheerio.load(pageSource) on every call, and the
// extract*/convert*/process* helpers read it instead of receiving the document as a parameter.
// NOTE(review): it is unassigned until getProcessedText() has run, and each call overwrites
// the previously loaded document — confirm no helper is invoked outside of that call.
let $: cheerio.CheerioAPI;
export async function getProcessedText(
pageSource: string,
baseUrl: string,
@@ -39,16 +51,21 @@ export async function getProcessedText(
removeScriptTag = true,
removeStyleTag = true,
removeTags = [],
formatAsMarkdown = true
formatAsMarkdown = true,
maxContentLength = 100000,
preserveLineBreaks = true,
includeMetadata = true
} = options;
try {
const $ = cheerio.load(pageSource);
// Initialize cheerio without problematic options
$ = cheerio.load(pageSource);
// Remove unwanted tags
// Remove unwanted tags completely
const tagsToRemove: string[] = [];
if (removeScriptTag) tagsToRemove.push('script');
if (removeStyleTag) tagsToRemove.push('style');
if (removeScriptTag) tagsToRemove.push('noscript');
tagsToRemove.push(...removeTags);
const uniqueTags = [...new Set(tagsToRemove)];
@@ -56,25 +73,45 @@ export async function getProcessedText(
$(tag).remove();
});
// Extract page title
const title = $('title').text() || $('h1').first().text() || 'Untitled';
// Remove common unwanted elements
$('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove();
// Extract metadata
const title = extractTitle();
const description = extractDescription();
const language = extractLanguage();
// Generate both formats
const markdown = formatAsMarkdown ?
convertToMarkdown($, baseUrl, options) :
convertToPlainText($, baseUrl, options); // Fallback to plain text if markdown disabled
convertToMarkdown(baseUrl, options) :
'';
const plainText = convertToPlainText($, baseUrl, options);
const plainText = convertToPlainText(baseUrl, options);
// Truncate if necessary
const finalMarkdown = markdown.substring(0, maxContentLength);
const finalPlainText = plainText.substring(0, maxContentLength);
// Count elements
const linkCount = $('a[href]').length;
const imageCount = $('img').length;
const wordCount = countWords(finalPlainText);
const result: ProcessedResult = {
markdown,
plainText,
markdown: finalMarkdown,
plainText: finalPlainText,
metadata: {
title: title.trim(),
title,
description,
url: baseUrl,
processedAt: new Date().toISOString(),
textLength: plainText.length,
markdownLength: markdown.length
textLength: finalPlainText.length,
markdownLength: finalMarkdown.length,
hasContent: finalPlainText.length > 0,
language,
wordCount,
linkCount,
imageCount
}
};
@@ -82,95 +119,307 @@ export async function getProcessedText(
} catch (error) {
console.error('Error while getting processed text: ', error);
// Return empty result on error
return {
markdown: '',
plainText: '',
metadata: {
title: '',
url: baseUrl,
processedAt: new Date().toISOString(),
textLength: 0,
markdownLength: 0
}
};
return createEmptyResult(baseUrl);
}
}
function convertToMarkdown($: cheerio.CheerioAPI, baseUrl: string, options: ProcessTextOptions): string {
const { keepImages, keepWebpageLinks } = options;
/**
 * Picks a human-readable title for the currently loaded document.
 *
 * Candidates are tried in order: the document's `<head><title>`, the Open Graph
 * `og:title` meta tag, then the first `<h1>`. Whitespace runs (including newlines)
 * are collapsed to single spaces so the result is always a single line.
 *
 * Only `head > title` is consulted for the document title: a bare `$('title')`
 * also matches `<title>` elements inside inline SVG, and `.text()` on a
 * multi-element selection concatenates all of them.
 *
 * @returns The first non-empty candidate, or `'Untitled'` when none is found.
 */
function extractTitle(): string {
  const normalize = (value: string | undefined): string =>
    (value ?? '').replace(/\s+/g, ' ').trim();

  return (
    normalize($('head > title').first().text()) ||
    normalize($('meta[property="og:title"]').attr('content')) ||
    normalize($('h1').first().text()) ||
    'Untitled'
  );
}
/**
 * Reads the page description from the currently loaded document.
 *
 * The standard `description` meta tag is preferred; the Open Graph
 * `og:description` tag is used when the former is missing or blank.
 *
 * @returns The trimmed description, or an empty string when neither tag has content.
 */
function extractDescription(): string {
  const metaSelectors = [
    'meta[name="description"]',
    'meta[property="og:description"]',
  ];

  for (const selector of metaSelectors) {
    const content = $(selector).attr('content')?.trim();
    if (content) {
      return content;
    }
  }

  return '';
}
/**
 * Reads the `lang` attribute of the `<html>` element of the currently loaded document.
 *
 * @returns The attribute value, or `'en'` when the attribute is absent or empty.
 */
function extractLanguage(): string {
  const declaredLanguage = $('html').attr('lang');
  if (declaredLanguage) {
    return declaredLanguage;
  }
  return 'en';
}
/**
 * Counts whitespace-separated words in `text`.
 *
 * A word is any maximal run of non-whitespace characters, so leading, trailing
 * and repeated whitespace never produce empty words.
 *
 * @param text - Text to inspect.
 * @returns The number of words; `0` for an empty or whitespace-only string.
 */
function countWords(text: string): number {
  const tokens = text.match(/\S+/g);
  return tokens === null ? 0 : tokens.length;
}
/**
 * Renders the currently loaded document (module-level `$`) as Markdown.
 *
 * Output layout:
 *   1. `# <title>` when a real title was found (not the `'Untitled'` placeholder);
 *   2. `> <description>` when a meta description exists;
 *   3. the first "main content" container holding more than 100 characters of
 *      converted text, if any;
 *   4. whatever remains of `<body>` after that container has been removed.
 *
 * All DOM edits happen on a clone of `<body>`, so the loaded document is left intact.
 * The body is handed to `processElementToMarkdown` untouched apart from pruning:
 * headings, paragraphs and lists must still be elements when the recursive
 * converter walks them, otherwise their structure is lost.
 *
 * @param baseUrl - Base URL used to absolutise links and image sources.
 * @param options - Conversion options; forwarded to the recursive converter, and
 *                  `preserveLineBreaks` is forwarded to `cleanMarkdown`.
 * @returns The cleaned Markdown document.
 */
function convertToMarkdown(baseUrl: string, options: ProcessTextOptions): string {
  const { preserveLineBreaks } = options;

  // Title and description must each stay on a single line to remain a valid
  // heading / blockquote, so collapse any embedded newlines.
  const singleLine = (value: string): string => value.replace(/\s+/g, ' ').trim();

  // Start with metadata if available
  let markdown = '';
  const title = singleLine(extractTitle());
  if (title && title !== 'Untitled') {
    markdown += `# ${title}\n\n`;
  }

  const description = singleLine(extractDescription());
  if (description) {
    markdown += `> ${description}\n\n`;
  }

  // Clone the body to avoid modifying the original
  const $body = $('body').clone();

  // Remove unwanted elements from the clone
  $body.find('script, style, noscript, meta, link').remove();
  $body.find('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove();

  // Process in order of importance
  const sections: string[] = [];

  // Process main content areas first
  const contentSelectors = [
    'main', 'article', '[role="main"]', '.content', '.main',
    '#content', '#main', '.post', '.article'
  ];

  for (const selector of contentSelectors) {
    const $content = $body.find(selector).first();
    if ($content.length === 0) {
      continue;
    }

    const mainContent = processElementToMarkdown($content, baseUrl, options, 0);
    if (mainContent.trim().length > 100) { // Only use if substantial content
      sections.push(mainContent);
      $content.remove(); // Remove from body to avoid duplication
      break;
    }
  }

  // Everything that was not claimed as main content
  sections.push(processElementToMarkdown($body, baseUrl, options, 0));

  // Combine sections
  markdown += sections.filter(section => section.trim().length > 0).join('\n\n');

  // Final cleanup
  return cleanMarkdown(markdown, preserveLineBreaks);
}
/**
 * Recursively converts the child nodes of `$element` into Markdown.
 *
 * Text nodes are emitted with whitespace runs collapsed to a single space (not
 * trimmed), so inline neighbours such as `<b>Note:</b> text` keep the space
 * between them. Element nodes are mapped by tag name: headings, paragraphs,
 * emphasis, inline/fenced code, blockquotes, lists, links, images, tables and
 * generic block containers; any other tag contributes only its converted children.
 *
 * @param $element - Cheerio selection whose contents are converted.
 * @param baseUrl  - Base URL used to resolve relative `href` / `src` values.
 * @param options  - `keepWebpageLinks` controls whether `<a>` becomes a Markdown
 *                   link or plain text; `keepImages` controls whether `<img>` is
 *                   emitted; `options` is also passed to `shouldRemoveImage`.
 * @param depth    - Current recursion depth (callers start at 0).
 * @returns The Markdown fragment for the children of `$element`.
 */
function processElementToMarkdown($element: cheerio.Cheerio<any>, baseUrl: string, options: ProcessTextOptions, depth: number = 0): string {
  // A DOM tree is finite, so this cap only guards against stack exhaustion on
  // pathological input. Ordinary pages nest wrappers well past ten levels, so the
  // cap is generous, and content beyond it degrades to flat text instead of vanishing.
  const MAX_DEPTH = 200;
  if (depth > MAX_DEPTH) {
    return $element.text().replace(/\s+/g, ' ');
  }

  const { keepImages, keepWebpageLinks } = options;
  const convertChildren = ($child: cheerio.Cheerio<any>): string =>
    processElementToMarkdown($child, baseUrl, options, depth + 1);

  let markdown = '';

  $element.contents().each((_, node) => {
    if (node.type === 'text') {
      // Collapse, don't trim: boundary whitespace separates inline siblings.
      markdown += $(node).text().replace(/\s+/g, ' ');
      return;
    }
    if (node.type !== 'tag') {
      return; // comments, directives, script/style nodes
    }

    const $node = $(node);
    const tagName = node.name?.toLowerCase() || '';

    switch (tagName) {
      case 'h1':
      case 'h2':
      case 'h3':
      case 'h4':
      case 'h5':
      case 'h6': {
        // Headings must fit on one line; skip empty ones rather than emit a bare "#".
        const headingText = $node.text().replace(/\s+/g, ' ').trim();
        if (headingText) {
          const level = Number(tagName.charAt(1));
          markdown += `\n${'#'.repeat(level)} ${headingText}\n\n`;
        }
        break;
      }

      case 'p': {
        const paragraphText = convertChildren($node).trim();
        if (paragraphText) {
          markdown += `\n${paragraphText}\n\n`;
        }
        break;
      }

      case 'br':
        markdown += '\n';
        break;

      case 'hr':
        markdown += '\n---\n\n';
        break;

      case 'strong':
      case 'b': {
        const strongText = convertChildren($node).trim();
        if (strongText) {
          markdown += `**${strongText}**`;
        }
        break;
      }

      case 'em':
      case 'i': {
        const emText = convertChildren($node).trim();
        if (emText) {
          markdown += `*${emText}*`;
        }
        break;
      }

      case 'code': {
        // <code> inside <pre> is rendered by the 'pre' case as a fenced block.
        if ($node.closest('pre').length) {
          break;
        }
        const codeText = $node.text().trim();
        if (codeText) {
          // A code span that itself contains a backtick needs a longer delimiter.
          const hasBacktick = codeText.includes('`');
          const delimiter = hasBacktick ? '``' : '`';
          const padding = hasBacktick ? ' ' : '';
          markdown += `${delimiter}${padding}${codeText}${padding}${delimiter}`;
        }
        break;
      }

      case 'pre': {
        // Drop leading blank lines and trailing whitespace only; the indentation
        // of the first code line is significant and must survive.
        const preText = $node.text().replace(/^(?:[ \t]*\n)+/, '').replace(/\s+$/, '');
        if (preText.trim()) {
          const codeClass = ($node.find('code').attr('class') ?? '').trim();
          const prefixed = /(?:^|\s)(?:language|lang)-(\S+)/.exec(codeClass);
          // Prefer an explicit language-xxx token; otherwise accept a lone class
          // name, but never paste a multi-class list (e.g. "hljs foo") after the fence.
          const language = prefixed ? prefixed[1] : (/^\S+$/.test(codeClass) ? codeClass : '');
          markdown += `\n\`\`\`${language}\n${preText}\n\`\`\`\n\n`;
        }
        break;
      }

      case 'blockquote': {
        const quoteText = convertChildren($node).trim();
        if (quoteText) {
          const quoted = quoteText.split('\n').map(line => `> ${line}`).join('\n');
          markdown += `\n${quoted}\n\n`;
        }
        break;
      }

      case 'ul': {
        const listItems: string[] = [];
        $node.children('li').each((_, li) => {
          const itemText = convertChildren($(li)).trim();
          if (itemText) {
            listItems.push(`- ${itemText}`);
          }
        });
        if (listItems.length > 0) {
          markdown += '\n' + listItems.join('\n') + '\n\n';
        }
        break;
      }

      case 'ol': {
        const olItems: string[] = [];
        $node.children('li').each((_, li) => {
          const itemText = convertChildren($(li)).trim();
          if (itemText) {
            // Number by emitted position so skipped empty items leave no gaps.
            olItems.push(`${olItems.length + 1}. ${itemText}`);
          }
        });
        if (olItems.length > 0) {
          markdown += '\n' + olItems.join('\n') + '\n\n';
        }
        break;
      }

      case 'a': {
        if (!keepWebpageLinks) {
          markdown += convertChildren($node);
          break;
        }
        const href = $node.attr('href');
        // Link text must be one line or the [text](url) syntax breaks.
        const linkText = convertChildren($node).replace(/\s+/g, ' ').trim();
        if (!linkText) {
          break;
        }
        if (!href) {
          markdown += linkText;
          break;
        }
        try {
          const resolved = new URL(href, baseUrl);
          // The HTML is untrusted: never turn script/data URLs into clickable links.
          const isUnsafe = /^(?:javascript|vbscript|data):$/i.test(resolved.protocol);
          markdown += isUnsafe ? linkText : `[${linkText}](${resolved.toString()})`;
        } catch {
          // Unresolvable href: keep the visible text, drop the target.
          markdown += linkText;
        }
        break;
      }

      case 'img': {
        if (!keepImages) {
          break;
        }
        const src = $node.attr('src');
        const alt = ($node.attr('alt') || $node.attr('title') || '').replace(/\s+/g, ' ').trim();
        if (src && !shouldRemoveImage(src, options)) {
          try {
            const absoluteUrl = new URL(src, baseUrl).toString();
            markdown += `![${alt}](${absoluteUrl})`;
          } catch {
            // Ignore invalid URLs
          }
        }
        break;
      }

      case 'table':
        markdown += processTableToMarkdown($node);
        break;

      case 'div':
      case 'section':
      case 'article':
      case 'header':
      case 'footer':
      case 'nav':
      case 'aside': {
        // Block-level containers: isolate their content with blank lines.
        const blockContent = convertChildren($node).trim();
        if (blockContent) {
          markdown += `\n${blockContent}\n\n`;
        }
        break;
      }

      default:
        // For other tags, just process their content
        markdown += convertChildren($node);
        break;
    }
  });

  return markdown;
}
// Process blockquotes
$body.find('blockquote').each((_, element) => {
const $el = $(element);
const text = $el.text().trim();
$el.replaceWith(`> ${text.replace(/\n/g, '\n> ')}\n\n`);
});
/**
 * Renders an HTML table as a GitHub-flavoured Markdown pipe table.
 *
 * The first row with cells becomes the header row. Cells with `colspan` are
 * followed by empty filler cells, and every row is padded to the widest row so
 * the table stays rectangular. Cell text is flattened to one line and literal
 * `|` characters are escaped, because either would otherwise break the row.
 * Rows and cells belonging to a nested table are not lifted into this table;
 * a nested table only contributes its flattened text to the enclosing cell.
 *
 * @param $table - Cheerio selection wrapping the `<table>` element.
 * @returns The Markdown table surrounded by blank lines, or `''` when the table
 *          has no cells.
 */
function processTableToMarkdown($table: cheerio.Cheerio<any>): string {
  // The HTML specification caps colspan at 1000; also bounds work on hostile input.
  const MAX_COLSPAN = 1000;
  const tableNode = $table.get(0);
  const rows: string[][] = [];

  $table.find('tr').each((_, row) => {
    const $row = $(row);
    if ($row.closest('table').get(0) !== tableNode) {
      return; // row of a nested table
    }

    const cells: string[] = [];
    $row.children('th, td').each((_, cell) => {
      const $cell = $(cell);
      const text = $cell.text().replace(/\s+/g, ' ').trim().replace(/\|/g, '\\|');
      const parsedSpan = Number.parseInt($cell.attr('colspan') ?? '1', 10);
      const colspan = Number.isFinite(parsedSpan) && parsedSpan >= 1
        ? Math.min(parsedSpan, MAX_COLSPAN)
        : 1;

      cells.push(text);
      // Add empty cells for colspan
      for (let i = 1; i < colspan; i++) {
        cells.push('');
      }
    });

    if (cells.length > 0) {
      rows.push(cells);
    }
  });

  if (rows.length === 0) return '';

  const columnCount = Math.max(...rows.map(cells => cells.length));
  const renderRow = (cells: string[]): string => {
    const padded = cells.concat(new Array<string>(columnCount - cells.length).fill(''));
    return `| ${padded.join(' | ')} |`;
  };

  const [headerRow, ...dataRows] = rows;
  const lines = [
    renderRow(headerRow ?? []),
    `|${' --- |'.repeat(columnCount)}`,
    ...dataRows.map(renderRow),
  ];

  return `\n${lines.join('\n')}\n\n`;
}
function convertToPlainText(baseUrl: string, options: ProcessTextOptions): string {
const { keepImages, keepWebpageLinks } = options;
const $body = $('body').clone();
// Remove unwanted elements
$body.find('script, style, noscript, meta, link').remove();
$body.find('[style*="display:none"], [style*="display: none"], .hidden, [aria-hidden="true"]').remove();
// Process images
if (keepImages) {
$body.find('img').each((_, element) => {
@@ -179,89 +428,12 @@ function convertToMarkdown($: cheerio.CheerioAPI, baseUrl: string, options: Proc
const alt = $img.attr('alt') || '';
if (src && !shouldRemoveImage(src, options)) {
const absoluteUrl = new URL(src, baseUrl).toString();
$img.replaceWith(`![${alt}](${absoluteUrl})\n\n`);
} else {
$img.remove();
}
});
} else {
$body.find('img').remove();
}
// Process links
if (keepWebpageLinks) {
$body.find('a[href]').each((_, element) => {
const $link = $(element);
const href = $link.attr('href');
const text = $link.text().trim();
if (href && text) {
const absoluteUrl = new URL(href, baseUrl).toString();
$link.replaceWith(`[${text}](${absoluteUrl})`);
} else if (text) {
$link.replaceWith(text);
} else {
$link.remove();
}
});
} else {
$body.find('a[href]').each((_, element) => {
const $link = $(element);
$link.replaceWith($link.text().trim());
});
}
// Process tables (basic support)
$body.find('table').each((_, element) => {
const $table = $(element);
let markdownTable = '\n';
$table.find('tr').each((rowIndex, row) => {
const $row = $(row);
const cells: string[] = [];
$row.find('th, td').each((_, cell) => {
const $cell = $(cell);
cells.push($cell.text().trim());
});
if (cells.length > 0) {
markdownTable += `| ${cells.join(' | ')} |\n`;
// Add header separator after first row
if (rowIndex === 0) {
markdownTable += `|${cells.map(() => '---').join('|')}|\n`;
try {
const absoluteUrl = new URL(src, baseUrl).toString();
$img.replaceWith(`[Image: ${alt || 'image'} - ${absoluteUrl}]`);
} catch {
$img.remove();
}
}
});
$table.replaceWith(markdownTable + '\n');
});
// Get the final text and clean it up
let markdown = $body.text();
// Clean up excessive whitespace while preserving structure
markdown = cleanMarkdown(markdown);
return markdown;
}
function convertToPlainText($: cheerio.CheerioAPI, baseUrl: string, options: ProcessTextOptions): string {
const { keepImages, keepWebpageLinks } = options;
const $body = $('body').clone();
// Process images
if (keepImages) {
$body.find('img').each((_, element) => {
const $img = $(element);
const src = $img.attr('src');
if (src && !shouldRemoveImage(src, options)) {
const absoluteUrl = new URL(src, baseUrl).toString();
$img.replaceWith(`\nImage: ${absoluteUrl}\n`);
} else {
$img.remove();
}
@@ -278,8 +450,12 @@ function convertToPlainText($: cheerio.CheerioAPI, baseUrl: string, options: Pro
const text = $link.text().trim();
if (href && text) {
const absoluteUrl = new URL(href, baseUrl).toString();
$link.replaceWith(`${text}: ${absoluteUrl} `);
try {
const absoluteUrl = new URL(href, baseUrl).toString();
$link.replaceWith(`${text} (${absoluteUrl})`);
} catch {
$link.replaceWith(text);
}
}
});
} else {
@@ -303,25 +479,52 @@ function shouldRemoveImage(src: string, options: ProcessTextOptions): boolean {
if (removeGifImage) imageTypesToRemove.push('.gif');
imageTypesToRemove.push(...removeImageTypes);
return imageTypesToRemove.some(type => src.includes(type));
return imageTypesToRemove.some(type => src.toLowerCase().includes(type.toLowerCase()));
}
/**
 * Normalises whitespace in generated Markdown without damaging its structure.
 *
 * Outside fenced code blocks each line is trimmed, runs of spaces are collapsed,
 * whitespace after heading hashes is normalised to one space, and runs of blank
 * lines are reduced to a single blank line. Lines inside fenced code blocks keep
 * their indentation and inner spacing; only trailing whitespace is removed.
 *
 * Single newlines are never doubled: table rows, list items and blockquote lines
 * must stay on consecutive lines to be parsed as one table/list/quote.
 *
 * @param markdown           - Raw Markdown to clean.
 * @param preserveLineBreaks - When `true` (default) single line breaks are kept.
 *                             When `false`, consecutive plain-text lines are joined
 *                             with a space; headings, list items, blockquotes,
 *                             table rows, rules and code fences are never joined.
 * @returns The cleaned Markdown with no leading or trailing whitespace.
 */
function cleanMarkdown(markdown: string, preserveLineBreaks: boolean = true): string {
  const fencePattern = /^\s*(?:```|~~~)/;
  // Lines whose position on their own line carries Markdown meaning.
  const structuralPattern = /^(?:#{1,6} |[-*+] |\d+\. |>|\||---$|```|~~~)/;

  const cleaned: string[] = [];
  let insideFence = false;

  for (const rawLine of markdown.replace(/\r\n?/g, '\n').split('\n')) {
    if (fencePattern.test(rawLine)) {
      insideFence = !insideFence;
      cleaned.push(rawLine.trim());
      continue;
    }
    if (insideFence) {
      // Code is whitespace-sensitive: keep indentation, drop only trailing blanks.
      cleaned.push(rawLine.replace(/[ \t]+$/, ''));
      continue;
    }

    const line = rawLine
      .trim()
      .replace(/ {2,}/g, ' ')
      .replace(/^(#{1,6})[ \t]+/, '$1 ');
    const previous = cleaned.length > 0 ? cleaned[cleaned.length - 1] : undefined;

    if (line === '') {
      // Keep at most one blank line, and none at the very start.
      if (previous !== undefined && previous !== '') {
        cleaned.push('');
      }
      continue;
    }

    const joinWithPrevious =
      !preserveLineBreaks &&
      previous !== undefined &&
      previous !== '' &&
      !structuralPattern.test(previous) &&
      !structuralPattern.test(line);

    if (joinWithPrevious) {
      cleaned[cleaned.length - 1] = `${previous} ${line}`;
    } else {
      cleaned.push(line);
    }
  }

  return cleaned.join('\n').trim();
}
/**
 * Normalises whitespace in plain text while keeping its line structure.
 *
 * Only horizontal whitespace is collapsed. Collapsing every whitespace run
 * (newlines included) to a space would flatten the text to one line and make
 * the paragraph-break normalisation below impossible.
 *
 * @param text - Text to clean.
 * @returns Text with CRLF/CR converted to LF, runs of spaces/tabs collapsed to one
 *          space, no spaces adjacent to line breaks, at most one blank line between
 *          paragraphs, and no leading or trailing whitespace.
 */
function cleanText(text: string): string {
  return text
    // Normalise line endings
    .replace(/\r\n?/g, '\n')
    // Collapse horizontal whitespace only ([^\S\n] = whitespace except newline)
    .replace(/[^\S\n]+/g, ' ')
    // Drop spaces hugging a line break
    .replace(/ ?\n ?/g, '\n')
    // At most one blank line between paragraphs
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
/**
 * Builds the placeholder result returned when processing fails.
 *
 * Both text fields are empty, every counter is zero, `hasContent` is `false`,
 * and `processedAt` is the current time as an ISO-8601 string. No `language`
 * is set.
 *
 * @param url - URL to record in the result's metadata.
 * @returns A `ProcessedResult` with no content.
 */
function createEmptyResult(url: string): ProcessedResult {
  const processedAt = new Date().toISOString();

  const metadata: ProcessedResult['metadata'] = {
    title: '',
    description: '',
    url,
    processedAt,
    textLength: 0,
    markdownLength: 0,
    hasContent: false,
    wordCount: 0,
    linkCount: 0,
    imageCount: 0,
  };

  return { markdown: '', plainText: '', metadata };
}