// Source: server/src/markdownify/get_html.ts
// Extracted from patch 01a26783580d10b6f8045a4084ec6388ac52b577
// ("wip: get html", amhsirak, Mon, 17 Nov 2025 19:45:11 +0530).

import { chromium, Browser, Page } from 'playwright';

/** Optional settings accepted by {@link getPageSource}. */
export interface GetPageSourceOptions {
  /**
   * Time in seconds. It is converted to milliseconds and used both as the
   * page's default timeout and as the extra delay after navigation.
   * Defaults to 1.5.
   */
  wait?: number;
  /** Forwarded to `chromium.launch`. Defaults to `true`. */
  headless?: boolean;
  /** User-agent string the page is created with. Defaults to a mobile Edge UA. */
  userAgent?: string;
}

/**
 * Loads `url` in a freshly launched Chromium instance and returns the page's
 * serialized HTML.
 *
 * Failures are not propagated: any error raised while launching, navigating
 * or reading the page is logged with `console.error` and an empty string is
 * returned instead.
 *
 * @param url - Address to navigate to.
 * @param options - Optional overrides; see {@link GetPageSourceOptions}.
 * @returns The page HTML, or `''` if anything went wrong.
 */
export async function getPageSource(
  url: string,
  options: GetPageSourceOptions = {}
): Promise<string> {
  const {
    wait = 1.5,
    headless = true,
    userAgent = "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 640 XL LTE) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Mobile Safari/537.36 Edge/12.10166"
  } = options;

  let browser: Browser | null = null;

  try {
    browser = await chromium.launch({
      headless,
      args: ['--no-sandbox', '--disable-dev-shm-usage']
    });

    // Playwright's Page has no setUserAgent(); the user agent has to be
    // supplied when the page (and its implicit context) is created.
    const page: Page = await browser.newPage({ userAgent });

    // Convert wait time to milliseconds
    const waitMs = wait * 1000;

    // NOTE(review): `wait` doubles as the default timeout for navigation, so
    // the 1.5 s default may be too short for slow pages — confirm this is
    // intended before relying on it.
    page.setDefaultTimeout(waitMs);
    await page.goto(url, { waitUntil: 'domcontentloaded' });

    // Give the page extra time after DOMContentLoaded, if requested
    if (waitMs > 0) {
      await page.waitForTimeout(waitMs);
    }

    return await page.content();
  } catch (error: unknown) {
    console.error('Error while getting page source: ', error);
    return '';
  } finally {
    if (browser) {
      try {
        // Closing the browser also closes every page it opened, so no
        // separate page.close() is needed.
        await browser.close();
      } catch (closeError: unknown) {
        // A cleanup failure must not replace the value returned above.
        console.error('Error while closing browser: ', closeError);
      }
    }
  }
}