debug(temporary): turndown x amzn
This commit is contained in:
132
server/src/markdownify/debug_turndown.ts
Normal file
132
server/src/markdownify/debug_turndown.ts
Normal file
@@ -0,0 +1,132 @@
|
||||
import { getPageSource } from './get_html';
|
||||
import { getProcessedText } from './get_llm_input_text';
|
||||
import * as cheerio from 'cheerio';
|
||||
import TurndownService from 'turndown';
|
||||
|
||||
async function debugTurndown() {
|
||||
const testUrls = [
|
||||
"https://amazon.com/",
|
||||
];
|
||||
|
||||
for (const url of testUrls) {
|
||||
console.log(`\n${'='.repeat(70)}`);
|
||||
console.log(`🔍 Testing URL: ${url}`);
|
||||
console.log(`${'='.repeat(70)}`);
|
||||
|
||||
try {
|
||||
const pageSource = await getPageSource(url, {
|
||||
wait: 3.0, // Longer wait time
|
||||
timeout: 15000 // 15 second timeout
|
||||
});
|
||||
|
||||
if (!pageSource || pageSource.length < 100) {
|
||||
console.error("❌ No page source received or content too short");
|
||||
continue;
|
||||
}
|
||||
|
||||
// Save raw HTML for inspection
|
||||
const fs = await import('fs/promises');
|
||||
const domain = new URL(url).hostname;
|
||||
await fs.writeFile(`debug_${domain}_raw.html`, pageSource);
|
||||
console.log(`💾 Raw HTML saved to debug_${domain}_raw.html (${pageSource.length} chars)`);
|
||||
|
||||
// Parse with cheerio
|
||||
const $ = cheerio.load(pageSource);
|
||||
|
||||
// Check what's in the body
|
||||
const bodyText = $('body').text();
|
||||
console.log(`📄 Body text length: ${bodyText.length} chars`);
|
||||
console.log(`📄 Body preview: ${bodyText.substring(0, 200)}...`);
|
||||
|
||||
// Test content extraction
|
||||
const contentSelectors = [
|
||||
'main', 'article', '[role="main"]', '.content', '.main-content',
|
||||
'#content', '#main', '.post', '.article'
|
||||
];
|
||||
|
||||
let mainContent: cheerio.Cheerio<any> = $('body');
|
||||
let foundSelector = 'body (fallback)';
|
||||
|
||||
for (const selector of contentSelectors) {
|
||||
const $content = $(selector).first();
|
||||
if ($content.length > 0 && $content.text().trim().length > 10) {
|
||||
console.log(`✅ Found content with selector: ${selector}`);
|
||||
console.log(`📝 Content text length: ${$content.text().length}`);
|
||||
mainContent = $content;
|
||||
foundSelector = selector;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`🎯 Using content from: ${foundSelector}`);
|
||||
|
||||
// Test Turndown directly
|
||||
console.log("\n🧪 Testing Turndown directly...");
|
||||
const turndownService = new TurndownService();
|
||||
|
||||
if (mainContent.length > 0) {
|
||||
const contentHtml = mainContent.html() || '';
|
||||
if (contentHtml && contentHtml.length > 10) {
|
||||
console.log(`📦 Content HTML length: ${contentHtml.length} chars`);
|
||||
|
||||
try {
|
||||
const contentMarkdown = turndownService.turndown(contentHtml);
|
||||
console.log(`📝 Turndown result length: ${contentMarkdown.length} chars`);
|
||||
|
||||
if (contentMarkdown.length > 0) {
|
||||
console.log(`📝 Markdown preview: ${contentMarkdown.substring(0, 300)}...`);
|
||||
await fs.writeFile(`debug_${domain}_turndown.md`, contentMarkdown);
|
||||
console.log(`💾 Turndown output saved to debug_${domain}_turndown.md`);
|
||||
} else {
|
||||
console.log("❌ Turndown produced empty markdown");
|
||||
}
|
||||
} catch (turndownError) {
|
||||
console.error("❌ Turndown conversion failed:", turndownError);
|
||||
}
|
||||
} else {
|
||||
console.log("❌ No HTML content found for Turndown");
|
||||
}
|
||||
}
|
||||
|
||||
// Test our full function
|
||||
console.log("\n🧪 Testing full getProcessedText function...");
|
||||
const result = await getProcessedText(pageSource, url, {
|
||||
keepImages: true,
|
||||
keepWebpageLinks: true,
|
||||
removeScriptTag: true,
|
||||
removeStyleTag: true,
|
||||
formatAsMarkdown: true
|
||||
});
|
||||
|
||||
console.log("📊 Result metadata:");
|
||||
console.log(`- Markdown length: ${result.metadata.markdownLength} chars`);
|
||||
console.log(`- Plain text length: ${result.metadata.textLength} chars`);
|
||||
console.log(`- Has content: ${result.metadata.hasContent}`);
|
||||
console.log(`- Content score: ${result.metadata.contentScore}/10`);
|
||||
|
||||
if (result.markdown && result.markdown.length > 0) {
|
||||
console.log(`📄 Markdown preview (300 chars):`);
|
||||
console.log(result.markdown.substring(0, 300) + '...');
|
||||
await fs.writeFile(`debug_${domain}_full.md`, result.markdown);
|
||||
console.log(`💾 Full output saved to debug_${domain}_full.md`);
|
||||
} else {
|
||||
console.log("❌ Empty markdown from full function");
|
||||
|
||||
// Debug why it's empty
|
||||
if (result.plainText && result.plainText.length > 0) {
|
||||
console.log("ℹ️ But plain text has content, so markdown conversion failed");
|
||||
await fs.writeFile(`debug_${domain}_plain.txt`, result.plainText);
|
||||
console.log(`💾 Plain text saved to debug_${domain}_plain.txt`);
|
||||
}
|
||||
}
|
||||
|
||||
} catch (error) {
|
||||
console.error(`💥 Error processing ${url}:`, error);
|
||||
}
|
||||
|
||||
// Small delay between requests
|
||||
await new Promise(resolve => setTimeout(resolve, 1000));
|
||||
}
|
||||
}
|
||||
|
||||
debugTurndown().catch(console.error);
|
||||
Reference in New Issue
Block a user