fix: get important content
This commit is contained in:
@@ -1,3 +1,4 @@
|
|||||||
|
|
||||||
import * as cheerio from 'cheerio';
|
import * as cheerio from 'cheerio';
|
||||||
import { URL } from 'url';
|
import { URL } from 'url';
|
||||||
|
|
||||||
@@ -81,17 +82,20 @@ export async function getProcessedText(
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// Process website links
|
// Process website links - Preserve the link text AND the URL
|
||||||
$('a[href]').each((_, element) => {
|
$('a[href]').each((_, element) => {
|
||||||
try {
|
try {
|
||||||
const $link = $(element);
|
const $link = $(element);
|
||||||
if (!keepWebpageLinks) {
|
if (!keepWebpageLinks) {
|
||||||
$link.remove();
|
// Just remove the link but keep the text
|
||||||
|
$link.replaceWith($link.text());
|
||||||
} else {
|
} else {
|
||||||
const href = $link.attr('href');
|
const href = $link.attr('href');
|
||||||
if (href) {
|
if (href) {
|
||||||
const absoluteUrl = new URL(href, baseUrl).toString();
|
const absoluteUrl = new URL(href, baseUrl).toString();
|
||||||
$link.replaceWith($link.text() + ': ' + absoluteUrl + ' ');
|
const linkText = $link.text().trim();
|
||||||
|
// Keep both the link text and the URL
|
||||||
|
$link.replaceWith(linkText + ' [' + absoluteUrl + '] ');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -99,44 +103,64 @@ export async function getProcessedText(
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// Get text content
|
// Get text content
|
||||||
let text: string;
|
let text: string;
|
||||||
|
|
||||||
|
// Use a simpler approach to extract text
|
||||||
const bodyContent = $('body');
|
const bodyContent = $('body');
|
||||||
|
|
||||||
if (bodyContent.length > 0) {
|
if (bodyContent.length > 0) {
|
||||||
const bodyHtml = bodyContent.html() || '';
|
// Remove script and style tags that might have been missed
|
||||||
const minimizedBody = minifyHtml(bodyHtml);
|
bodyContent.find('script, style, noscript').remove();
|
||||||
text = htmlToText(minimizedBody);
|
|
||||||
|
// Get text with proper spacing
|
||||||
|
text = bodyContent
|
||||||
|
.contents()
|
||||||
|
.map((_, el) => {
|
||||||
|
if (el.type === 'text') {
|
||||||
|
return $(el).text();
|
||||||
|
}
|
||||||
|
if (el.type === 'tag') {
|
||||||
|
const $el = $(el);
|
||||||
|
const tagName = el.name?.toLowerCase();
|
||||||
|
|
||||||
|
// Add appropriate spacing for block elements
|
||||||
|
if (['div', 'p', 'br', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(tagName || '')) {
|
||||||
|
return $el.text() + '\n';
|
||||||
|
}
|
||||||
|
return $el.text() + ' ';
|
||||||
|
}
|
||||||
|
return '';
|
||||||
|
})
|
||||||
|
.get()
|
||||||
|
.join('');
|
||||||
} else {
|
} else {
|
||||||
text = $.text();
|
text = $.text();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clean up the text while preserving quotes
|
||||||
|
text = cleanText(text);
|
||||||
|
|
||||||
return text;
|
return text;
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error while getting processed text: ', error);
|
console.error('Error while getting processed text: ', error);
|
||||||
return ''; // Explicitly return empty string on error
|
return '';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function minifyHtml(html: string): string {
|
// Clean up text while preserving quotes and important content
|
||||||
return html
|
function cleanText(text: string): string {
|
||||||
.replace(/\s+/g, ' ')
|
if (!text) return '';
|
||||||
.replace(/>\s+</g, '><')
|
|
||||||
|
return text
|
||||||
|
// Replace multiple spaces with single space, but be careful with quotes
|
||||||
|
.replace(/[^\S\n]+/g, ' ')
|
||||||
|
// Replace multiple newlines with max 2 newlines
|
||||||
|
.replace(/\n\s*\n/g, '\n\n')
|
||||||
|
// Clean up spaces around quotes but don't remove the quotes
|
||||||
|
.replace(/\s+"/g, ' "')
|
||||||
|
.replace(/"\s+/g, '" ')
|
||||||
|
// Remove leading/trailing whitespace
|
||||||
.trim();
|
.trim();
|
||||||
}
|
|
||||||
|
|
||||||
function htmlToText(html: string): string {
|
|
||||||
const $ = cheerio.load(html);
|
|
||||||
|
|
||||||
$('script, style, noscript').remove();
|
|
||||||
|
|
||||||
let text = $('body').text() || $.text();
|
|
||||||
|
|
||||||
text = text
|
|
||||||
.replace(/\s+/g, ' ')
|
|
||||||
.replace(/\n\s*\n/g, '\n')
|
|
||||||
.trim();
|
|
||||||
|
|
||||||
return text;
|
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user