From b14d84d83ab24f9c35cbca7da1d05165af8e9b53 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 20 Nov 2025 03:51:53 +0530 Subject: [PATCH] fix: -rm debug turndown --- server/src/markdownify/debug_turndown.ts | 132 ----------------------- 1 file changed, 132 deletions(-) delete mode 100644 server/src/markdownify/debug_turndown.ts diff --git a/server/src/markdownify/debug_turndown.ts b/server/src/markdownify/debug_turndown.ts deleted file mode 100644 index 1d62b109..00000000 --- a/server/src/markdownify/debug_turndown.ts +++ /dev/null @@ -1,132 +0,0 @@ -import { getPageSource } from './get_html'; -import { getProcessedText } from './get_llm_input_text'; -import * as cheerio from 'cheerio'; -import TurndownService from 'turndown'; - -async function debugTurndown() { - const testUrls = [ - "https://amazon.com/", - ]; - - for (const url of testUrls) { - console.log(`\n${'='.repeat(70)}`); - console.log(`๐Ÿ” Testing URL: ${url}`); - console.log(`${'='.repeat(70)}`); - - try { - const pageSource = await getPageSource(url, { - wait: 3.0, // Longer wait time - timeout: 15000 // 15 second timeout - }); - - if (!pageSource || pageSource.length < 100) { - console.error("โŒ No page source received or content too short"); - continue; - } - - // Save raw HTML for inspection - const fs = await import('fs/promises'); - const domain = new URL(url).hostname; - await fs.writeFile(`debug_${domain}_raw.html`, pageSource); - console.log(`๐Ÿ’พ Raw HTML saved to debug_${domain}_raw.html (${pageSource.length} chars)`); - - // Parse with cheerio - const $ = cheerio.load(pageSource); - - // Check what's in the body - const bodyText = $('body').text(); - console.log(`๐Ÿ“„ Body text length: ${bodyText.length} chars`); - console.log(`๐Ÿ“„ Body preview: ${bodyText.substring(0, 200)}...`); - - // Test content extraction - const contentSelectors = [ - 'main', 'article', '[role="main"]', '.content', '.main-content', - '#content', '#main', '.post', '.article' - ]; - - let mainContent: cheerio.Cheerio = $('body'); - let foundSelector = 'body (fallback)'; - - for (const selector of contentSelectors) { - const $content = $(selector).first(); - if ($content.length > 0 && $content.text().trim().length > 10) { - console.log(`โœ… Found content with selector: ${selector}`); - console.log(`๐Ÿ“ Content text length: ${$content.text().length}`); - mainContent = $content; - foundSelector = selector; - break; - } - } - - console.log(`๐ŸŽฏ Using content from: ${foundSelector}`); - - // Test Turndown directly - console.log("\n๐Ÿงช Testing Turndown directly..."); - const turndownService = new TurndownService(); - - if (mainContent.length > 0) { - const contentHtml = mainContent.html() || ''; - if (contentHtml && contentHtml.length > 10) { - console.log(`๐Ÿ“ฆ Content HTML length: ${contentHtml.length} chars`); - - try { - const contentMarkdown = turndownService.turndown(contentHtml); - console.log(`๐Ÿ“ Turndown result length: ${contentMarkdown.length} chars`); - - if (contentMarkdown.length > 0) { - console.log(`๐Ÿ“ Markdown preview: ${contentMarkdown.substring(0, 300)}...`); - await fs.writeFile(`debug_${domain}_turndown.md`, contentMarkdown); - console.log(`๐Ÿ’พ Turndown output saved to debug_${domain}_turndown.md`); - } else { - console.log("โŒ Turndown produced empty markdown"); - } - } catch (turndownError) { - console.error("โŒ Turndown conversion failed:", turndownError); - } - } else { - console.log("โŒ No HTML content found for Turndown"); - } - } - - // Test our full function - console.log("\n๐Ÿงช Testing full getProcessedText function..."); - const result = await getProcessedText(pageSource, url, { - keepImages: true, - keepWebpageLinks: true, - removeScriptTag: true, - removeStyleTag: true, - formatAsMarkdown: true - }); - - console.log("๐Ÿ“Š Result metadata:"); - console.log(`- Markdown length: ${result.metadata.markdownLength} chars`); - console.log(`- Plain text length: ${result.metadata.textLength} chars`); - console.log(`- Has content: ${result.metadata.hasContent}`); - console.log(`- Content score: ${result.metadata.contentScore}/10`); - - if (result.markdown && result.markdown.length > 0) { - console.log(`๐Ÿ“„ Markdown preview (300 chars):`); - console.log(result.markdown.substring(0, 300) + '...'); - await fs.writeFile(`debug_${domain}_full.md`, result.markdown); - console.log(`๐Ÿ’พ Full output saved to debug_${domain}_full.md`); - } else { - console.log("โŒ Empty markdown from full function"); - - // Debug why it's empty - if (result.plainText && result.plainText.length > 0) { - console.log("โ„น๏ธ But plain text has content, so markdown conversion failed"); - await fs.writeFile(`debug_${domain}_plain.txt`, result.plainText); - console.log(`๐Ÿ’พ Plain text saved to debug_${domain}_plain.txt`); - } - } - - } catch (error) { - console.error(`๐Ÿ’ฅ Error processing ${url}:`, error); - } - - // Small delay between requests - await new Promise(resolve => setTimeout(resolve, 1000)); - } -} - -debugTurndown().catch(console.error); \ No newline at end of file