fix: required page instance scrape action

This commit is contained in:
Rohit Rajan
2025-11-30 20:10:25 +05:30
parent 2e6e89e453
commit eb512d8df3
3 changed files with 82 additions and 93 deletions

View File

@@ -550,9 +550,9 @@ export class RemoteBrowser {
try { try {
const blocker = await PlaywrightBlocker.fromLists(fetch, ['https://easylist.to/easylist/easylist.txt']); const blocker = await PlaywrightBlocker.fromLists(fetch, ['https://easylist.to/easylist/easylist.txt']);
await blocker.enableBlockingInPage(this.currentPage); await blocker.enableBlockingInPage(this.currentPage as any);
this.client = await this.currentPage.context().newCDPSession(this.currentPage); this.client = await this.currentPage.context().newCDPSession(this.currentPage);
await blocker.disableBlockingInPage(this.currentPage); await blocker.disableBlockingInPage(this.currentPage as any);
console.log('Adblocker initialized'); console.log('Adblocker initialized');
} catch (error: any) { } catch (error: any) {
console.warn('Failed to initialize adblocker, continuing without it:', error.message); console.warn('Failed to initialize adblocker, continuing without it:', error.message);

View File

@@ -1,4 +1,4 @@
import { connectToRemoteBrowser } from "../browser-management/browserConnection"; import { Page } from "playwright-core";
import { parseMarkdown } from "./markdown"; import { parseMarkdown } from "./markdown";
import logger from "../logger"; import logger from "../logger";
@@ -21,13 +21,13 @@ async function gotoWithFallback(page: any, url: string) {
* Fetches a webpage, strips scripts/styles/images/etc, * Fetches a webpage, strips scripts/styles/images/etc,
* returns clean Markdown using parser. * returns clean Markdown using parser.
* @param url - The URL to convert * @param url - The URL to convert
* @param existingPage - Optional existing Playwright page instance to reuse * @param page - Existing Playwright page instance to use
*/ */
export async function convertPageToMarkdown(url: string): Promise<string> { export async function convertPageToMarkdown(url: string, page: Page): Promise<string> {
const browser = await connectToRemoteBrowser(); try {
const page = await browser.newPage(); logger.log('info', `[Scrape] Using existing page instance for markdown conversion of ${url}`);
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); await gotoWithFallback(page, url);
const cleanedHtml = await page.evaluate(() => { const cleanedHtml = await page.evaluate(() => {
const selectors = [ const selectors = [
@@ -51,7 +51,6 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
document.querySelectorAll(sel).forEach(e => e.remove()); document.querySelectorAll(sel).forEach(e => e.remove());
}); });
// Remove inline event handlers (onclick, onload…)
const all = document.querySelectorAll("*"); const all = document.querySelectorAll("*");
all.forEach(el => { all.forEach(el => {
[...el.attributes].forEach(attr => { [...el.attributes].forEach(attr => {
@@ -64,29 +63,25 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
return document.documentElement.outerHTML; return document.documentElement.outerHTML;
}); });
if (shouldCloseBrowser && browser) {
logger.log('info', `[Scrape] Closing browser instance created for markdown conversion`);
await browser.close();
} else {
logger.log('info', `[Scrape] Keeping existing browser instance open after markdown conversion`);
}
// Convert cleaned HTML → Markdown
const markdown = await parseMarkdown(cleanedHtml, url); const markdown = await parseMarkdown(cleanedHtml, url);
return markdown; return markdown;
} catch (error: any) {
logger.error(`[Scrape] Error during markdown conversion: ${error.message}`);
throw error;
}
} }
/** /**
* Fetches a webpage, strips scripts/styles/images/etc, * Fetches a webpage, strips scripts/styles/images/etc,
* returns clean HTML. * returns clean HTML.
* @param url - The URL to convert * @param url - The URL to convert
* @param existingPage - Optional existing Playwright page instance to reuse * @param page - Existing Playwright page instance to use
*/ */
export async function convertPageToHTML(url: string): Promise<string> { export async function convertPageToHTML(url: string, page: Page): Promise<string> {
const browser = await connectToRemoteBrowser(); try {
const page = await browser.newPage(); logger.log('info', `[Scrape] Using existing page instance for HTML conversion of ${url}`);
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); await gotoWithFallback(page, url);
const cleanedHtml = await page.evaluate(() => { const cleanedHtml = await page.evaluate(() => {
const selectors = [ const selectors = [
@@ -110,7 +105,6 @@ export async function convertPageToHTML(url: string): Promise<string> {
document.querySelectorAll(sel).forEach(e => e.remove()); document.querySelectorAll(sel).forEach(e => e.remove());
}); });
// Remove inline event handlers (onclick, onload…)
const all = document.querySelectorAll("*"); const all = document.querySelectorAll("*");
all.forEach(el => { all.forEach(el => {
[...el.attributes].forEach(attr => { [...el.attributes].forEach(attr => {
@@ -123,13 +117,9 @@ export async function convertPageToHTML(url: string): Promise<string> {
return document.documentElement.outerHTML; return document.documentElement.outerHTML;
}); });
if (shouldCloseBrowser && browser) {
logger.log('info', `[Scrape] Closing browser instance created for HTML conversion`);
await browser.close();
} else {
logger.log('info', `[Scrape] Keeping existing browser instance open after HTML conversion`);
}
// Return cleaned HTML directly
return cleanedHtml; return cleanedHtml;
} catch (error: any) {
logger.error(`[Scrape] Error during HTML conversion: ${error.message}`);
throw error;
}
} }

View File

@@ -15,7 +15,6 @@ import { encrypt, decrypt } from '../utils/auth';
import { WorkflowFile } from 'maxun-core'; import { WorkflowFile } from 'maxun-core';
import { cancelScheduledWorkflow, scheduleWorkflow } from '../storage/schedule'; import { cancelScheduledWorkflow, scheduleWorkflow } from '../storage/schedule';
import { pgBossClient } from '../storage/pgboss'; import { pgBossClient } from '../storage/pgboss';
chromium.use(stealthPlugin());
export const router = Router(); export const router = Router();