optimize scraping part 3 (#3183)
This commit is contained in:
@@ -1105,51 +1105,6 @@ function checkDisabledFromStyle(element) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// element should always be the parent of stopped_element
|
|
||||||
function getElementContext(element, stopped_element) {
|
|
||||||
// dfs to collect the non unique_id context
|
|
||||||
let fullContext = new Array();
|
|
||||||
|
|
||||||
if (element === stopped_element) {
|
|
||||||
return fullContext;
|
|
||||||
}
|
|
||||||
|
|
||||||
// sometimes '*' shows as an after custom style
|
|
||||||
const afterCustomStyle = getElementComputedStyle(element, "::after");
|
|
||||||
if (afterCustomStyle) {
|
|
||||||
const afterCustom = afterCustomStyle
|
|
||||||
.getPropertyValue("content")
|
|
||||||
.replace(/"/g, "");
|
|
||||||
if (
|
|
||||||
afterCustom.toLowerCase().includes("*") ||
|
|
||||||
afterCustom.toLowerCase().includes("require")
|
|
||||||
) {
|
|
||||||
fullContext.push(afterCustom);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (element.childNodes.length === 0) {
|
|
||||||
return fullContext.join(";");
|
|
||||||
}
|
|
||||||
// if the element already has a context, then add it to the list first
|
|
||||||
for (var child of element.childNodes) {
|
|
||||||
let childContext = "";
|
|
||||||
if (child.nodeType === Node.TEXT_NODE && isElementVisible(element)) {
|
|
||||||
if (!element.hasAttribute("unique_id")) {
|
|
||||||
childContext = getElementText(child).trim();
|
|
||||||
}
|
|
||||||
} else if (child.nodeType === Node.ELEMENT_NODE) {
|
|
||||||
if (!child.hasAttribute("unique_id") && isElementVisible(child)) {
|
|
||||||
childContext = getElementContext(child, stopped_element);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (childContext.length > 0) {
|
|
||||||
fullContext.push(childContext);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return fullContext.join(";");
|
|
||||||
}
|
|
||||||
|
|
||||||
function getVisibleText(element) {
|
function getVisibleText(element) {
|
||||||
let visibleText = [];
|
let visibleText = [];
|
||||||
|
|
||||||
@@ -1223,8 +1178,8 @@ function getElementContent(element, skipped_element = null) {
|
|||||||
nodeContent = cleanupText(nodeTextContentList.join(";"));
|
nodeContent = cleanupText(nodeTextContentList.join(";"));
|
||||||
}
|
}
|
||||||
let finalTextContent = cleanupText(textContent);
|
let finalTextContent = cleanupText(textContent);
|
||||||
// Currently we don't support too much context. Character limit is 1000 per element.
|
// Currently we don't support too much content. Character limit is 1000 per element.
|
||||||
// we don't think element context has to be that big
|
// we don't think element content has to be that big
|
||||||
const charLimit = 5000;
|
const charLimit = 5000;
|
||||||
if (finalTextContent.length > charLimit) {
|
if (finalTextContent.length > charLimit) {
|
||||||
if (nodeContent.length <= charLimit) {
|
if (nodeContent.length <= charLimit) {
|
||||||
@@ -1455,7 +1410,6 @@ async function buildElementTree(
|
|||||||
starter = document.body,
|
starter = document.body,
|
||||||
frame,
|
frame,
|
||||||
full_tree = false,
|
full_tree = false,
|
||||||
needContext = true,
|
|
||||||
hoverStylesMap = undefined,
|
hoverStylesMap = undefined,
|
||||||
) {
|
) {
|
||||||
// Generate hover styles map at the start
|
// Generate hover styles map at the start
|
||||||
@@ -1620,141 +1574,6 @@ async function buildElementTree(
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const getContextByParent = (element, ctx) => {
|
|
||||||
// for most elements, we're going 5 layers up to see if we can find "label" as a parent
|
|
||||||
// if found, most likely the context under label is relevant to this element
|
|
||||||
let targetParentElements = new Set(["label", "fieldset"]);
|
|
||||||
|
|
||||||
// look up for 5 levels to find the most contextual parent element
|
|
||||||
let targetContextualParent = null;
|
|
||||||
let currentEle = getDOMElementBySkyvenElement(element);
|
|
||||||
if (!currentEle) {
|
|
||||||
return ctx;
|
|
||||||
}
|
|
||||||
let parentEle = currentEle;
|
|
||||||
for (var i = 0; i < 5; i++) {
|
|
||||||
parentEle = parentEle.parentElement;
|
|
||||||
if (parentEle) {
|
|
||||||
if (
|
|
||||||
targetParentElements.has(parentEle.tagName.toLowerCase()) ||
|
|
||||||
(typeof parentEle.className === "string" &&
|
|
||||||
checkParentClass(parentEle.className.toLowerCase()))
|
|
||||||
) {
|
|
||||||
targetContextualParent = parentEle;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!targetContextualParent) {
|
|
||||||
return ctx;
|
|
||||||
}
|
|
||||||
|
|
||||||
let context = "";
|
|
||||||
var lowerCaseTagName = targetContextualParent.tagName.toLowerCase();
|
|
||||||
if (lowerCaseTagName === "fieldset") {
|
|
||||||
// fieldset is usually within a form or another element that contains the whole context
|
|
||||||
targetContextualParent = targetContextualParent.parentElement;
|
|
||||||
if (targetContextualParent) {
|
|
||||||
context = getElementContext(targetContextualParent, currentEle);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
context = getElementContext(targetContextualParent, currentEle);
|
|
||||||
}
|
|
||||||
if (context.length > 0) {
|
|
||||||
ctx.push(context);
|
|
||||||
}
|
|
||||||
return ctx;
|
|
||||||
};
|
|
||||||
|
|
||||||
const getContextByLinked = (element, ctx) => {
|
|
||||||
let currentEle = getDOMElementBySkyvenElement(element);
|
|
||||||
if (!currentEle) {
|
|
||||||
return ctx;
|
|
||||||
}
|
|
||||||
|
|
||||||
const document = currentEle.getRootNode();
|
|
||||||
// check labels pointed to this element
|
|
||||||
// 1. element id -> labels pointed to this id
|
|
||||||
// 2. by attr "aria-labelledby" -> only one label with this id
|
|
||||||
let linkedElements = new Array();
|
|
||||||
const elementId = currentEle.getAttribute("id");
|
|
||||||
if (elementId) {
|
|
||||||
try {
|
|
||||||
linkedElements = [
|
|
||||||
...document.querySelectorAll(`label[for="${elementId}"]`),
|
|
||||||
];
|
|
||||||
} catch (e) {
|
|
||||||
_jsConsoleLog("failed to query labels: ", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
const labelled = currentEle.getAttribute("aria-labelledby");
|
|
||||||
if (labelled) {
|
|
||||||
const label = document.getElementById(labelled);
|
|
||||||
if (label) {
|
|
||||||
linkedElements.push(label);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
const described = currentEle.getAttribute("aria-describedby");
|
|
||||||
if (described) {
|
|
||||||
const describe = document.getElementById(described);
|
|
||||||
if (describe) {
|
|
||||||
linkedElements.push(describe);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const fullContext = new Array();
|
|
||||||
for (let i = 0; i < linkedElements.length; i++) {
|
|
||||||
const linked = linkedElements[i];
|
|
||||||
// if the element is a child of the label, we should stop to get context before the element
|
|
||||||
const content = getElementContent(linked, currentEle);
|
|
||||||
if (content) {
|
|
||||||
fullContext.push(content);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const context = fullContext.join(";");
|
|
||||||
if (context.length > 0) {
|
|
||||||
ctx.push(context);
|
|
||||||
}
|
|
||||||
return ctx;
|
|
||||||
};
|
|
||||||
|
|
||||||
const getContextByTable = (element, ctx) => {
|
|
||||||
// pass element's parent's context to the element for listed tags
|
|
||||||
let tagsWithDirectParentContext = new Set(["a"]);
|
|
||||||
// if the element is a child of a td, th, or tr, then pass the grandparent's context to the element
|
|
||||||
let parentTagsThatDelegateParentContext = new Set(["td", "th", "tr"]);
|
|
||||||
if (tagsWithDirectParentContext.has(element.tagName)) {
|
|
||||||
let curElement = getDOMElementBySkyvenElement(element);
|
|
||||||
if (!curElement) {
|
|
||||||
return ctx;
|
|
||||||
}
|
|
||||||
let parentElement = curElement.parentElement;
|
|
||||||
if (!parentElement) {
|
|
||||||
return ctx;
|
|
||||||
}
|
|
||||||
if (
|
|
||||||
parentTagsThatDelegateParentContext.has(
|
|
||||||
parentElement.tagName.toLowerCase(),
|
|
||||||
)
|
|
||||||
) {
|
|
||||||
let grandParentElement = parentElement.parentElement;
|
|
||||||
if (grandParentElement) {
|
|
||||||
let context = getElementContext(grandParentElement, curElement);
|
|
||||||
if (context.length > 0) {
|
|
||||||
ctx.push(context);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
let context = getElementContext(parentElement, curElement);
|
|
||||||
if (context.length > 0) {
|
|
||||||
ctx.push(context);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ctx;
|
|
||||||
};
|
|
||||||
|
|
||||||
const trimDuplicatedText = (element) => {
|
const trimDuplicatedText = (element) => {
|
||||||
if (element.children.length === 0 && !element.options) {
|
if (element.children.length === 0 && !element.options) {
|
||||||
return;
|
return;
|
||||||
@@ -1780,26 +1599,6 @@ async function buildElementTree(
|
|||||||
element.text = element.text.replace(new RegExp(`^;+|;+$`, "g"), "");
|
element.text = element.text.replace(new RegExp(`^;+|;+$`, "g"), "");
|
||||||
};
|
};
|
||||||
|
|
||||||
const trimDuplicatedContext = (element) => {
|
|
||||||
if (element.children.length === 0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// DFS to delete duplicated context
|
|
||||||
element.children.forEach((child) => {
|
|
||||||
trimDuplicatedContext(child);
|
|
||||||
if (element.context === child.context) {
|
|
||||||
delete child.context;
|
|
||||||
}
|
|
||||||
if (child.context) {
|
|
||||||
child.context = child.context.replace(element.text, "");
|
|
||||||
if (!child.context) {
|
|
||||||
delete child.context;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
};
|
|
||||||
|
|
||||||
// some elements without children nodes should be removed out, such as <label>
|
// some elements without children nodes should be removed out, such as <label>
|
||||||
const removeOrphanNode = (results) => {
|
const removeOrphanNode = (results) => {
|
||||||
const trimmedResults = [];
|
const trimmedResults = [];
|
||||||
@@ -1845,48 +1644,11 @@ async function buildElementTree(
|
|||||||
element,
|
element,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
let ctxList = [];
|
|
||||||
if (needContext) {
|
|
||||||
try {
|
|
||||||
ctxList = getContextByLinked(element, ctxList);
|
|
||||||
} catch (e) {
|
|
||||||
_jsConsoleError("failed to get context by linked: ", e);
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
ctxList = getContextByParent(element, ctxList);
|
|
||||||
} catch (e) {
|
|
||||||
_jsConsoleError("failed to get context by parent: ", e);
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
ctxList = getContextByTable(element, ctxList);
|
|
||||||
} catch (e) {
|
|
||||||
_jsConsoleError("failed to get context by table: ", e);
|
|
||||||
}
|
|
||||||
const context = ctxList.join(";");
|
|
||||||
if (context && context.length <= 5000) {
|
|
||||||
element.context = context;
|
|
||||||
}
|
|
||||||
// FIXME: skip <a> for now to prevent navigating to other page by mistake
|
|
||||||
if (element.tagName !== "a" && checkStringIncludeRequire(context)) {
|
|
||||||
if (
|
|
||||||
!element.attributes["required"] &&
|
|
||||||
!element.attributes["aria-required"]
|
|
||||||
) {
|
|
||||||
element.attributes["required"] = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
resultArray = removeOrphanNode(resultArray);
|
resultArray = removeOrphanNode(resultArray);
|
||||||
resultArray.forEach((root) => {
|
resultArray.forEach((root) => {
|
||||||
trimDuplicatedText(root);
|
trimDuplicatedText(root);
|
||||||
if (needContext) {
|
|
||||||
trimDuplicatedContext(root);
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
|
|
||||||
return [elements, resultArray];
|
return [elements, resultArray];
|
||||||
@@ -2411,7 +2173,6 @@ async function addIncrementalNodeToMap(parentNode, childrenNode) {
|
|||||||
child,
|
child,
|
||||||
"",
|
"",
|
||||||
true,
|
true,
|
||||||
false,
|
|
||||||
window.globalHoverStylesMap,
|
window.globalHoverStylesMap,
|
||||||
);
|
);
|
||||||
if (newNodeTree.length > 0) {
|
if (newNodeTree.length > 0) {
|
||||||
|
|||||||
Reference in New Issue
Block a user