fix: get important content

This commit is contained in:
amhsirak
2025-11-17 20:50:25 +05:30
parent 191ac52ee3
commit af9570659f

View File

@@ -1,3 +1,4 @@
import * as cheerio from 'cheerio'; import * as cheerio from 'cheerio';
import { URL } from 'url'; import { URL } from 'url';
@@ -81,17 +82,20 @@ export async function getProcessedText(
} }
}); });
// Process website links // Process website links - Preserve the link text AND the URL
$('a[href]').each((_, element) => { $('a[href]').each((_, element) => {
try { try {
const $link = $(element); const $link = $(element);
if (!keepWebpageLinks) { if (!keepWebpageLinks) {
$link.remove(); // Just remove the link but keep the text
$link.replaceWith($link.text());
} else { } else {
const href = $link.attr('href'); const href = $link.attr('href');
if (href) { if (href) {
const absoluteUrl = new URL(href, baseUrl).toString(); const absoluteUrl = new URL(href, baseUrl).toString();
$link.replaceWith($link.text() + ': ' + absoluteUrl + ' '); const linkText = $link.text().trim();
// Keep both the link text and the URL
$link.replaceWith(linkText + ' [' + absoluteUrl + '] ');
} }
} }
} catch (error) { } catch (error) {
@@ -99,44 +103,64 @@ export async function getProcessedText(
} }
}); });
// Get text content // Get text content
let text: string; let text: string;
// Use a simpler approach to extract text
const bodyContent = $('body'); const bodyContent = $('body');
if (bodyContent.length > 0) { if (bodyContent.length > 0) {
const bodyHtml = bodyContent.html() || ''; // Remove script and style tags that might have been missed
const minimizedBody = minifyHtml(bodyHtml); bodyContent.find('script, style, noscript').remove();
text = htmlToText(minimizedBody);
// Get text with proper spacing
text = bodyContent
.contents()
.map((_, el) => {
if (el.type === 'text') {
return $(el).text();
}
if (el.type === 'tag') {
const $el = $(el);
const tagName = el.name?.toLowerCase();
// Add appropriate spacing for block elements
if (['div', 'p', 'br', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(tagName || '')) {
return $el.text() + '\n';
}
return $el.text() + ' ';
}
return '';
})
.get()
.join('');
} else { } else {
text = $.text(); text = $.text();
} }
// Clean up the text while preserving quotes
text = cleanText(text);
return text; return text;
} catch (error) { } catch (error) {
console.error('Error while getting processed text: ', error); console.error('Error while getting processed text: ', error);
return ''; // Explicitly return empty string on error return '';
} }
} }
function minifyHtml(html: string): string { // Clean up text while preserving quotes and important content
return html function cleanText(text: string): string {
.replace(/\s+/g, ' ') if (!text) return '';
.replace(/>\s+</g, '><')
return text
// Replace multiple spaces with single space, but be careful with quotes
.replace(/[^\S\n]+/g, ' ')
// Replace multiple newlines with max 2 newlines
.replace(/\n\s*\n/g, '\n\n')
// Clean up spaces around quotes but don't remove the quotes
.replace(/\s+"/g, ' "')
.replace(/"\s+/g, '" ')
// Remove leading/trailing whitespace
.trim(); .trim();
}
function htmlToText(html: string): string {
const $ = cheerio.load(html);
$('script, style, noscript').remove();
let text = $('body').text() || $.text();
text = text
.replace(/\s+/g, ' ')
.replace(/\n\s*\n/g, '\n')
.trim();
return text;
} }