fix: required page instance scrape action
This commit is contained in:
@@ -550,9 +550,9 @@ export class RemoteBrowser {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
const blocker = await PlaywrightBlocker.fromLists(fetch, ['https://easylist.to/easylist/easylist.txt']);
|
const blocker = await PlaywrightBlocker.fromLists(fetch, ['https://easylist.to/easylist/easylist.txt']);
|
||||||
await blocker.enableBlockingInPage(this.currentPage);
|
await blocker.enableBlockingInPage(this.currentPage as any);
|
||||||
this.client = await this.currentPage.context().newCDPSession(this.currentPage);
|
this.client = await this.currentPage.context().newCDPSession(this.currentPage);
|
||||||
await blocker.disableBlockingInPage(this.currentPage);
|
await blocker.disableBlockingInPage(this.currentPage as any);
|
||||||
console.log('Adblocker initialized');
|
console.log('Adblocker initialized');
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
console.warn('Failed to initialize adblocker, continuing without it:', error.message);
|
console.warn('Failed to initialize adblocker, continuing without it:', error.message);
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
import { connectToRemoteBrowser } from "../browser-management/browserConnection";
|
import { Page } from "playwright-core";
|
||||||
import { parseMarkdown } from "./markdown";
|
import { parseMarkdown } from "./markdown";
|
||||||
import logger from "../logger";
|
import logger from "../logger";
|
||||||
|
|
||||||
@@ -21,115 +21,105 @@ async function gotoWithFallback(page: any, url: string) {
|
|||||||
* Fetches a webpage, strips scripts/styles/images/etc,
|
* Fetches a webpage, strips scripts/styles/images/etc,
|
||||||
* returns clean Markdown using parser.
|
* returns clean Markdown using parser.
|
||||||
* @param url - The URL to convert
|
* @param url - The URL to convert
|
||||||
* @param existingPage - Optional existing Playwright page instance to reuse
|
* @param page - Existing Playwright page instance to use
|
||||||
*/
|
*/
|
||||||
export async function convertPageToMarkdown(url: string): Promise<string> {
|
export async function convertPageToMarkdown(url: string, page: Page): Promise<string> {
|
||||||
const browser = await connectToRemoteBrowser();
|
try {
|
||||||
const page = await browser.newPage();
|
logger.log('info', `[Scrape] Using existing page instance for markdown conversion of ${url}`);
|
||||||
|
|
||||||
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
|
await gotoWithFallback(page, url);
|
||||||
|
|
||||||
const cleanedHtml = await page.evaluate(() => {
|
const cleanedHtml = await page.evaluate(() => {
|
||||||
const selectors = [
|
const selectors = [
|
||||||
"script",
|
"script",
|
||||||
"style",
|
"style",
|
||||||
"link[rel='stylesheet']",
|
"link[rel='stylesheet']",
|
||||||
"noscript",
|
"noscript",
|
||||||
"meta",
|
"meta",
|
||||||
"svg",
|
"svg",
|
||||||
"img",
|
"img",
|
||||||
"picture",
|
"picture",
|
||||||
"source",
|
"source",
|
||||||
"video",
|
"video",
|
||||||
"audio",
|
"audio",
|
||||||
"iframe",
|
"iframe",
|
||||||
"object",
|
"object",
|
||||||
"embed"
|
"embed"
|
||||||
];
|
];
|
||||||
|
|
||||||
selectors.forEach(sel => {
|
selectors.forEach(sel => {
|
||||||
document.querySelectorAll(sel).forEach(e => e.remove());
|
document.querySelectorAll(sel).forEach(e => e.remove());
|
||||||
});
|
|
||||||
|
|
||||||
// Remove inline event handlers (onclick, onload…)
|
|
||||||
const all = document.querySelectorAll("*");
|
|
||||||
all.forEach(el => {
|
|
||||||
[...el.attributes].forEach(attr => {
|
|
||||||
if (attr.name.startsWith("on")) {
|
|
||||||
el.removeAttribute(attr.name);
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const all = document.querySelectorAll("*");
|
||||||
|
all.forEach(el => {
|
||||||
|
[...el.attributes].forEach(attr => {
|
||||||
|
if (attr.name.startsWith("on")) {
|
||||||
|
el.removeAttribute(attr.name);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
return document.documentElement.outerHTML;
|
||||||
});
|
});
|
||||||
|
|
||||||
return document.documentElement.outerHTML;
|
const markdown = await parseMarkdown(cleanedHtml, url);
|
||||||
});
|
return markdown;
|
||||||
|
} catch (error: any) {
|
||||||
if (shouldCloseBrowser && browser) {
|
logger.error(`[Scrape] Error during markdown conversion: ${error.message}`);
|
||||||
logger.log('info', `[Scrape] Closing browser instance created for markdown conversion`);
|
throw error;
|
||||||
await browser.close();
|
|
||||||
} else {
|
|
||||||
logger.log('info', `[Scrape] Keeping existing browser instance open after markdown conversion`);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert cleaned HTML → Markdown
|
|
||||||
const markdown = await parseMarkdown(cleanedHtml, url);
|
|
||||||
return markdown;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fetches a webpage, strips scripts/styles/images/etc,
|
* Fetches a webpage, strips scripts/styles/images/etc,
|
||||||
* returns clean HTML.
|
* returns clean HTML.
|
||||||
* @param url - The URL to convert
|
* @param url - The URL to convert
|
||||||
* @param existingPage - Optional existing Playwright page instance to reuse
|
* @param page - Existing Playwright page instance to use
|
||||||
*/
|
*/
|
||||||
export async function convertPageToHTML(url: string): Promise<string> {
|
export async function convertPageToHTML(url: string, page: Page): Promise<string> {
|
||||||
const browser = await connectToRemoteBrowser();
|
try {
|
||||||
const page = await browser.newPage();
|
logger.log('info', `[Scrape] Using existing page instance for HTML conversion of ${url}`);
|
||||||
|
|
||||||
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
|
await gotoWithFallback(page, url);
|
||||||
|
|
||||||
const cleanedHtml = await page.evaluate(() => {
|
const cleanedHtml = await page.evaluate(() => {
|
||||||
const selectors = [
|
const selectors = [
|
||||||
"script",
|
"script",
|
||||||
"style",
|
"style",
|
||||||
"link[rel='stylesheet']",
|
"link[rel='stylesheet']",
|
||||||
"noscript",
|
"noscript",
|
||||||
"meta",
|
"meta",
|
||||||
"svg",
|
"svg",
|
||||||
"img",
|
"img",
|
||||||
"picture",
|
"picture",
|
||||||
"source",
|
"source",
|
||||||
"video",
|
"video",
|
||||||
"audio",
|
"audio",
|
||||||
"iframe",
|
"iframe",
|
||||||
"object",
|
"object",
|
||||||
"embed"
|
"embed"
|
||||||
];
|
];
|
||||||
|
|
||||||
selectors.forEach(sel => {
|
selectors.forEach(sel => {
|
||||||
document.querySelectorAll(sel).forEach(e => e.remove());
|
document.querySelectorAll(sel).forEach(e => e.remove());
|
||||||
});
|
|
||||||
|
|
||||||
// Remove inline event handlers (onclick, onload…)
|
|
||||||
const all = document.querySelectorAll("*");
|
|
||||||
all.forEach(el => {
|
|
||||||
[...el.attributes].forEach(attr => {
|
|
||||||
if (attr.name.startsWith("on")) {
|
|
||||||
el.removeAttribute(attr.name);
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const all = document.querySelectorAll("*");
|
||||||
|
all.forEach(el => {
|
||||||
|
[...el.attributes].forEach(attr => {
|
||||||
|
if (attr.name.startsWith("on")) {
|
||||||
|
el.removeAttribute(attr.name);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
return document.documentElement.outerHTML;
|
||||||
});
|
});
|
||||||
|
|
||||||
return document.documentElement.outerHTML;
|
return cleanedHtml;
|
||||||
});
|
} catch (error: any) {
|
||||||
|
logger.error(`[Scrape] Error during HTML conversion: ${error.message}`);
|
||||||
if (shouldCloseBrowser && browser) {
|
throw error;
|
||||||
logger.log('info', `[Scrape] Closing browser instance created for HTML conversion`);
|
|
||||||
await browser.close();
|
|
||||||
} else {
|
|
||||||
logger.log('info', `[Scrape] Keeping existing browser instance open after HTML conversion`);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return cleaned HTML directly
|
|
||||||
return cleanedHtml;
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,7 +15,6 @@ import { encrypt, decrypt } from '../utils/auth';
|
|||||||
import { WorkflowFile } from 'maxun-core';
|
import { WorkflowFile } from 'maxun-core';
|
||||||
import { cancelScheduledWorkflow, scheduleWorkflow } from '../storage/schedule';
|
import { cancelScheduledWorkflow, scheduleWorkflow } from '../storage/schedule';
|
||||||
import { pgBossClient } from '../storage/pgboss';
|
import { pgBossClient } from '../storage/pgboss';
|
||||||
chromium.use(stealthPlugin());
|
|
||||||
|
|
||||||
export const router = Router();
|
export const router = Router();
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user