optimize scraping part 4 (#3192)

This commit is contained in:
LawyZheng
2025-08-15 01:55:59 +08:00
committed by GitHub
parent 04fd540cd5
commit 81767e3189

View File

@@ -1194,16 +1194,42 @@ const checkParentClass = (className) => {
}; };
/**
 * Collapse every run of whitespace in a string into a single space.
 *
 * @param {?string} str - Text to normalise. Falsy values (null, undefined,
 *   "") are returned unchanged.
 * @returns {?string} `str` with each `\s+` run replaced by one " ", or the
 *   input itself when it is falsy or contains no whitespace.
 */
function removeMultipleSpaces(str) {
  // `!str` already covers the empty string, so no separate length check.
  if (!str) {
    return str;
  }
  // Fast path: skip the global replace when there is nothing to rewrite.
  // The probe must use the same `\s` class as the replacement below; probing
  // only for " ", "\t" and "\n" would let "\r", "\f", "\v", NBSP and other
  // Unicode whitespace slip through un-normalised.
  if (!/\s/.test(str)) {
    return str;
  }
  return str.replace(/\s+/g, " ");
}
/**
 * Normalise text: drop the "SVGs not supported by this browser." notice,
 * collapse whitespace runs via `removeMultipleSpaces`, and trim both ends.
 *
 * @param {?string} text - Raw text. Falsy values (null, undefined, "")
 *   yield "".
 * @returns {string} The cleaned text; "" when nothing remains.
 */
function cleanupText(text) {
  // `!text` already covers the empty string, so no separate length check.
  if (!text) {
    return "";
  }
  // Remove EVERY occurrence of the notice in a single pass. A string-pattern
  // `replace` would strip only the first one, and guarding it with
  // `includes` would scan the text twice for no benefit.
  const withoutSvgNotice = text
    .split("SVGs not supported by this browser.")
    .join("");
  return removeMultipleSpaces(withoutSvgNotice).trim();
}
const checkStringIncludeRequire = (str) => { const checkStringIncludeRequire = (str) => {
@@ -1269,62 +1295,29 @@ function getElementText(element) {
return element.data.trim(); return element.data.trim();
} }
let visibleText = []; const childNodes = element.childNodes;
for (let i = 0; i < element.childNodes.length; i++) { const childNodesLength = childNodes.length;
var node = element.childNodes[i];
let nodeText = "";
if (node.nodeType === Node.TEXT_NODE && (nodeText = node.data.trim())) {
visibleText.push(nodeText);
}
}
return visibleText.join(";");
}
function getElementContent(element, skipped_element = null) { // If no child nodes, return empty string directly
// DFS to get all the text content from all the nodes under the element if (childNodesLength === 0) {
if (skipped_element && element === skipped_element) {
return ""; return "";
} }
let textContent = getElementText(element); const visibleText = [];
let nodeContent = ""; let hasText = false;
// if element has children, then build a list of text and join with a semicolon
if (element.childNodes.length > 0) { for (let i = 0; i < childNodesLength; i++) {
let childTextContentList = new Array(); const node = childNodes[i];
let nodeTextContentList = new Array(); if (node.nodeType === Node.TEXT_NODE) {
for (var child of element.childNodes) { const nodeText = node.data.trim();
let childText = ""; if (nodeText.length > 0) {
if (child.nodeType === Node.TEXT_NODE) { visibleText.push(nodeText);
childText = getElementText(child).trim(); hasText = true;
if (childText.length > 0) {
nodeTextContentList.push(childText);
}
} else if (child.nodeType === Node.ELEMENT_NODE) {
// childText = child.textContent.trim();
childText = getElementContent(child, skipped_element);
} else {
_jsConsoleLog("Unhandled node type: ", child.nodeType);
} }
if (childText.length > 0) {
childTextContentList.push(childText);
}
}
textContent = childTextContentList.join(";");
nodeContent = cleanupText(nodeTextContentList.join(";"));
}
let finalTextContent = cleanupText(textContent);
// Currently we don't support too much content. Character limit is 1000 per element.
// we don't think element content has to be that big
const charLimit = 5000;
if (finalTextContent.length > charLimit) {
if (nodeContent.length <= charLimit) {
finalTextContent = nodeContent;
} else {
finalTextContent = "";
} }
} }
return finalTextContent; return hasText ? visibleText.join(";") : "";
} }
function getSelectOptions(element) { function getSelectOptions(element) {
@@ -1537,7 +1530,8 @@ async function buildTreeFromBody(
) { ) {
window.GlobalSkyvernFrameIndex = frame_index; window.GlobalSkyvernFrameIndex = frame_index;
} }
return await buildElementTree(document.body, frame); const elementsAndResultArray = await buildElementTree(document.body, frame);
return elementsAndResultArray;
} }
async function buildElementTree( async function buildElementTree(