fix: return empty str on error
This commit is contained in:
@@ -1,7 +1,4 @@
|
|||||||
// SPDX-License-Identifier: MIT
|
|
||||||
|
|
||||||
import * as cheerio from 'cheerio';
|
import * as cheerio from 'cheerio';
|
||||||
import { AnyAuthClient } from 'node_modules/google-auth-library/build/src';
|
|
||||||
import { URL } from 'url';
|
import { URL } from 'url';
|
||||||
|
|
||||||
export interface ProcessTextOptions {
|
export interface ProcessTextOptions {
|
||||||
@@ -54,7 +51,7 @@ export async function getProcessedText(
|
|||||||
|
|
||||||
const uniqueImageTypes = [...new Set(imageTypesToRemove)];
|
const uniqueImageTypes = [...new Set(imageTypesToRemove)];
|
||||||
|
|
||||||
$('img').each((_: any, element: any) => {
|
$('img').each((_, element) => {
|
||||||
try {
|
try {
|
||||||
const $img = $(element);
|
const $img = $(element);
|
||||||
if (!keepImages) {
|
if (!keepImages) {
|
||||||
@@ -85,7 +82,7 @@ export async function getProcessedText(
|
|||||||
});
|
});
|
||||||
|
|
||||||
// Process website links
|
// Process website links
|
||||||
$('a[href]').each((_: any, element: any) => {
|
$('a[href]').each((_, element) => {
|
||||||
try {
|
try {
|
||||||
const $link = $(element);
|
const $link = $(element);
|
||||||
if (!keepWebpageLinks) {
|
if (!keepWebpageLinks) {
|
||||||
@@ -107,7 +104,6 @@ export async function getProcessedText(
|
|||||||
const bodyContent = $('body');
|
const bodyContent = $('body');
|
||||||
|
|
||||||
if (bodyContent.length > 0) {
|
if (bodyContent.length > 0) {
|
||||||
// For minification, we'll use a simple approach to clean up the HTML
|
|
||||||
const bodyHtml = bodyContent.html() || '';
|
const bodyHtml = bodyContent.html() || '';
|
||||||
const minimizedBody = minifyHtml(bodyHtml);
|
const minimizedBody = minifyHtml(bodyHtml);
|
||||||
text = htmlToText(minimizedBody);
|
text = htmlToText(minimizedBody);
|
||||||
@@ -119,11 +115,10 @@ export async function getProcessedText(
|
|||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error while getting processed text: ', error);
|
console.error('Error while getting processed text: ', error);
|
||||||
return '';
|
return ''; // Explicitly return empty string on error
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Simple HTML minification function
|
|
||||||
function minifyHtml(html: string): string {
|
function minifyHtml(html: string): string {
|
||||||
return html
|
return html
|
||||||
.replace(/\s+/g, ' ')
|
.replace(/\s+/g, ' ')
|
||||||
@@ -131,17 +126,13 @@ function minifyHtml(html: string): string {
|
|||||||
.trim();
|
.trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert HTML to text (simplified version of inscriptis functionality)
|
|
||||||
function htmlToText(html: string): string {
|
function htmlToText(html: string): string {
|
||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
|
|
||||||
// Remove elements that shouldn't contribute to text
|
|
||||||
$('script, style, noscript').remove();
|
$('script, style, noscript').remove();
|
||||||
|
|
||||||
// Get text content with basic formatting
|
|
||||||
let text = $('body').text() || $.text();
|
let text = $('body').text() || $.text();
|
||||||
|
|
||||||
// Clean up the text
|
|
||||||
text = text
|
text = text
|
||||||
.replace(/\s+/g, ' ')
|
.replace(/\s+/g, ' ')
|
||||||
.replace(/\n\s*\n/g, '\n')
|
.replace(/\n\s*\n/g, '\n')
|
||||||
|
|||||||
Reference in New Issue
Block a user