diff --git a/server/src/browser-management/classes/RemoteBrowser.ts b/server/src/browser-management/classes/RemoteBrowser.ts index 266a0978..41a59176 100644 --- a/server/src/browser-management/classes/RemoteBrowser.ts +++ b/server/src/browser-management/classes/RemoteBrowser.ts @@ -550,9 +550,9 @@ export class RemoteBrowser { try { const blocker = await PlaywrightBlocker.fromLists(fetch, ['https://easylist.to/easylist/easylist.txt']); - await blocker.enableBlockingInPage(this.currentPage); + await blocker.enableBlockingInPage(this.currentPage as any); this.client = await this.currentPage.context().newCDPSession(this.currentPage); - await blocker.disableBlockingInPage(this.currentPage); + await blocker.disableBlockingInPage(this.currentPage as any); console.log('Adblocker initialized'); } catch (error: any) { console.warn('Failed to initialize adblocker, continuing without it:', error.message); diff --git a/server/src/markdownify/scrape.ts b/server/src/markdownify/scrape.ts index 52ae19bf..09df4276 100644 --- a/server/src/markdownify/scrape.ts +++ b/server/src/markdownify/scrape.ts @@ -1,4 +1,4 @@ -import { connectToRemoteBrowser } from "../browser-management/browserConnection"; +import { Page } from "playwright-core"; import { parseMarkdown } from "./markdown"; import logger from "../logger"; @@ -21,115 +21,105 @@ async function gotoWithFallback(page: any, url: string) { * Fetches a webpage, strips scripts/styles/images/etc, * returns clean Markdown using parser. * @param url - The URL to convert - * @param existingPage - Optional existing Playwright page instance to reuse + * @param page - Existing Playwright page instance to use */ -export async function convertPageToMarkdown(url: string): Promise { - const browser = await connectToRemoteBrowser(); - const page = await browser.newPage(); +export async function convertPageToMarkdown(url: string, page: Page): Promise { + try { + logger.log('info', `[Scrape] Using existing page instance for markdown conversion of ${url}`); - await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); + await gotoWithFallback(page, url); - const cleanedHtml = await page.evaluate(() => { - const selectors = [ - "script", - "style", - "link[rel='stylesheet']", - "noscript", - "meta", - "svg", - "img", - "picture", - "source", - "video", - "audio", - "iframe", - "object", - "embed" - ]; + const cleanedHtml = await page.evaluate(() => { + const selectors = [ + "script", + "style", + "link[rel='stylesheet']", + "noscript", + "meta", + "svg", + "img", + "picture", + "source", + "video", + "audio", + "iframe", + "object", + "embed" + ]; - selectors.forEach(sel => { - document.querySelectorAll(sel).forEach(e => e.remove()); - }); - - // Remove inline event handlers (onclick, onload…) - const all = document.querySelectorAll("*"); - all.forEach(el => { - [...el.attributes].forEach(attr => { - if (attr.name.startsWith("on")) { - el.removeAttribute(attr.name); - } + selectors.forEach(sel => { + document.querySelectorAll(sel).forEach(e => e.remove()); }); + + const all = document.querySelectorAll("*"); + all.forEach(el => { + [...el.attributes].forEach(attr => { + if (attr.name.startsWith("on")) { + el.removeAttribute(attr.name); + } + }); + }); + + return document.documentElement.outerHTML; }); - return document.documentElement.outerHTML; - }); - - if (shouldCloseBrowser && browser) { - logger.log('info', `[Scrape] Closing browser instance created for markdown conversion`); - await browser.close(); - } else { - logger.log('info', `[Scrape] Keeping existing browser instance open after markdown conversion`); + const markdown = await parseMarkdown(cleanedHtml, url); + return markdown; + } catch (error: any) { + logger.error(`[Scrape] Error during markdown conversion: ${error.message}`); + throw error; } - - // Convert cleaned HTML → Markdown - const markdown = await parseMarkdown(cleanedHtml, url); - return markdown; } /** * Fetches a webpage, strips scripts/styles/images/etc, * returns clean HTML. * @param url - The URL to convert - * @param existingPage - Optional existing Playwright page instance to reuse + * @param page - Existing Playwright page instance to use */ -export async function convertPageToHTML(url: string): Promise { - const browser = await connectToRemoteBrowser(); - const page = await browser.newPage(); +export async function convertPageToHTML(url: string, page: Page): Promise { + try { + logger.log('info', `[Scrape] Using existing page instance for HTML conversion of ${url}`); - await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); + await gotoWithFallback(page, url); - const cleanedHtml = await page.evaluate(() => { - const selectors = [ - "script", - "style", - "link[rel='stylesheet']", - "noscript", - "meta", - "svg", - "img", - "picture", - "source", - "video", - "audio", - "iframe", - "object", - "embed" - ]; + const cleanedHtml = await page.evaluate(() => { + const selectors = [ + "script", + "style", + "link[rel='stylesheet']", + "noscript", + "meta", + "svg", + "img", + "picture", + "source", + "video", + "audio", + "iframe", + "object", + "embed" + ]; - selectors.forEach(sel => { - document.querySelectorAll(sel).forEach(e => e.remove()); - }); - - // Remove inline event handlers (onclick, onload…) - const all = document.querySelectorAll("*"); - all.forEach(el => { - [...el.attributes].forEach(attr => { - if (attr.name.startsWith("on")) { - el.removeAttribute(attr.name); - } + selectors.forEach(sel => { + document.querySelectorAll(sel).forEach(e => e.remove()); }); + + const all = document.querySelectorAll("*"); + all.forEach(el => { + [...el.attributes].forEach(attr => { + if (attr.name.startsWith("on")) { + el.removeAttribute(attr.name); + } + }); + }); + + return document.documentElement.outerHTML; }); - return document.documentElement.outerHTML; - }); - - if (shouldCloseBrowser && browser) { - logger.log('info', `[Scrape] Closing browser instance created for HTML conversion`); - await browser.close(); - } else { - logger.log('info', `[Scrape] Keeping existing browser instance open after HTML conversion`); + return cleanedHtml; + } catch (error: any) { + logger.error(`[Scrape] Error during HTML conversion: ${error.message}`); + throw error; } - - // Return cleaned HTML directly - return cleanedHtml; } diff --git a/server/src/routes/storage.ts b/server/src/routes/storage.ts index 72518c7b..45d4bc53 100644 --- a/server/src/routes/storage.ts +++ b/server/src/routes/storage.ts @@ -15,7 +15,6 @@ import { encrypt, decrypt } from '../utils/auth'; import { WorkflowFile } from 'maxun-core'; import { cancelScheduledWorkflow, scheduleWorkflow } from '../storage/schedule'; import { pgBossClient } from '../storage/pgboss'; -chromium.use(stealthPlugin()); export const router = Router();