Merge pull request #22 from amhsirak/develop

feat: `scrapeList` & `scrapeListAuto` actions
This commit is contained in:
Karishma Shukla
2024-08-21 21:49:37 +05:30
committed by GitHub
16 changed files with 700 additions and 81 deletions

View File

@@ -23,7 +23,11 @@
"author": "Karishma Shukla", "author": "Karishma Shukla",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@cliqz/adblocker-playwright": "^1.31.3",
"cross-fetch": "^4.0.0",
"joi": "^17.6.0", "joi": "^17.6.0",
"playwright": "^1.20.1" "playwright": "^1.20.1",
"playwright-extra": "^4.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2"
} }
} }

View File

@@ -126,6 +126,85 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
return out; return out;
} }
/**
 * Scrolls the window down one viewport at a time until at least `limit`
 * elements match `selector`, or until the document stops growing.
 *
 * @param {string} selector CSS selector of the items being lazy-loaded.
 * @param {number} limit Number of matching items to wait for.
 * @returns {Promise<void>} Resolves once scrolling has stopped.
 */
async function scrollDownToLoadMore(selector, limit) {
  let previousHeight = 0;
  let itemsLoaded = 0;
  while (itemsLoaded < limit) {
    window.scrollBy(0, window.innerHeight);
    // Give lazily loaded content a moment to be appended.
    await new Promise(resolve => setTimeout(resolve, 1000));
    const currentHeight = document.body.scrollHeight;
    if (currentHeight === previousHeight) {
      break; // No more items to load
    }
    previousHeight = currentHeight;
    // querySelectorAll already returns every item currently in the DOM, so
    // the count is assigned; accumulating it would count items repeatedly.
    itemsLoaded = document.querySelectorAll(selector).length;
  }
}
/**
 * Scrolls the window up one viewport at a time until at least `limit`
 * elements match `selector`, or until the document height stops changing.
 *
 * @param {string} selector CSS selector of the items being lazy-loaded.
 * @param {number} limit Number of matching items to wait for.
 * @returns {Promise<void>} Resolves once scrolling has stopped.
 */
async function scrollUpToLoadMore(selector, limit) {
  let previousHeight = 0;
  let itemsLoaded = 0;
  while (itemsLoaded < limit) {
    window.scrollBy(0, -window.innerHeight);
    // Give lazily loaded content a moment to be inserted.
    await new Promise(resolve => setTimeout(resolve, 1000));
    const currentHeight = document.body.scrollHeight;
    if (currentHeight === previousHeight) {
      break; // No more items to load
    }
    previousHeight = currentHeight;
    // querySelectorAll already returns every item currently in the DOM, so
    // the count is assigned; accumulating it would count items repeatedly.
    itemsLoaded = document.querySelectorAll(selector).length;
  }
}
/**
 * Advances pagination by clicking the appropriate element matching `selector`.
 *
 * - One match: it is treated as a "Next" button and clicked.
 * - Several matches: they are treated as numbered page buttons; the button
 *   following the one carrying the `active` class is clicked. When none is
 *   marked `active`, the first match is clicked.
 *
 * @param {string} selector CSS selector of the "Next" button or page buttons.
 * @param {Array} scrapedData Items collected so far; only its length is read.
 * @param {number} limit Maximum number of items wanted.
 * @returns {Promise<boolean>} `false` when the limit is already met, `true`
 *   when a button was clicked.
 * @throws {Error} When nothing matches `selector`, or the active page button
 *   is the last one.
 */
async function clickNextPagination(selector, scrapedData, limit) {
  // Check if the limit is already met
  if (scrapedData.length >= limit) {
    return false; // No further action is needed
  }

  const endOfPagination = "No more items to load or pagination has ended.";
  // Query all matches up front: asking for the list only after a single-match
  // lookup failed would always yield an empty list.
  const candidates = Array.from(document.querySelectorAll(selector));

  if (candidates.length === 0) {
    throw new Error(endOfPagination);
  }
  if (candidates.length === 1) {
    candidates[0].click();
    return true;
  }

  // Numbered pagination: locate the current page and click its successor.
  const activeIndex = candidates.findIndex(button => button.classList.contains('active'));
  if (activeIndex === -1) {
    // No page is marked active; fall back to the first match.
    candidates[0].click();
    return true;
  }

  const successor = candidates[activeIndex + 1];
  if (!successor) {
    // The active button is the last one, so we are on the final page.
    throw new Error(endOfPagination);
  }
  successor.click();
  return true;
}
/** /**
* Returns a "scrape" result from the current page. * Returns a "scrape" result from the current page.
* @returns {Array<Object>} *Curated* array of scraped information (with sparse rows removed) * @returns {Array<Object>} *Curated* array of scraped information (with sparse rows removed)
@@ -183,6 +262,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
}; };
/** /**
* TODO: Simplify.
* Given an object with named lists of elements, * Given an object with named lists of elements,
* groups the elements by their distance in the DOM tree. * groups the elements by their distance in the DOM tree.
* @param {Object.<string, {selector: string, tag: string}>} lists The named lists of HTML elements. * @param {Object.<string, {selector: string, tag: string}>} lists The named lists of HTML elements.
@@ -250,4 +330,134 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
)); ));
} }
/**
 * Scrapes a list of similar items: one record per element matching
 * `listSelector`, with each field read from inside that element.
 *
 * The document is walked exactly once, so the result holds at most one record
 * per matching element and is empty when nothing matches.
 *
 * @param {Object} config - Configuration object
 * @param {string} config.listSelector - Selector matching each list item
 * @param {Object.<string, {selector: string, attribute?: string}>} config.fields -
 *   Fields to scrape, keyed by output label. `attribute` may be `innerText`,
 *   `innerHTML`, `src`, `href` or any other name, which is read via `getAttribute`.
 * @param {number} [config.limit=10] - Maximum number of items to scrape
 * @returns {Promise<Array.<Object>>} Scraped records; a field whose selector
 *   matches nothing inside an item is left out of that item's record
 */
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
  const scrapedData = [];
  // Get all parent elements matching the listSelector
  const parentElements = Array.from(document.querySelectorAll(listSelector));

  // Single pass: looping until `limit` is reached would never terminate when
  // fewer than `limit` elements exist, and would repeat the same records.
  for (const parent of parentElements) {
    if (scrapedData.length >= limit) break;
    const record = {};

    // For each field, select the corresponding element within the parent
    for (const [label, { selector, attribute }] of Object.entries(fields)) {
      const fieldElement = parent.querySelector(selector);
      if (!fieldElement) continue;

      if (attribute === 'innerText') {
        record[label] = fieldElement.innerText.trim();
      } else if (attribute === 'innerHTML') {
        record[label] = fieldElement.innerHTML.trim();
      } else if (attribute === 'src') {
        // The property yields the resolved (absolute) URL.
        record[label] = fieldElement.src;
      } else if (attribute === 'href') {
        record[label] = fieldElement.href;
      } else {
        record[label] = fieldElement.getAttribute(attribute);
      }
    }
    scrapedData.push(record);
  }
  return scrapedData;
};
/**
 * Gets all direct children of the elements matching the listSelector,
 * returning a CSS selector path and the text of each child.
 *
 * The path is built by walking up from the child, joining `tag.class1.class2`
 * segments with ` > `, and stopping at the first ancestor that has an id
 * (emitted as `tag#id`) or at the top of the tree.
 *
 * @param {string} listSelector - Selector for the list container(s)
 * @returns {Array.<{selector: string, innerText: string}>} One entry per child
 */
window.scrapeListAuto = function (listSelector) {
  const lists = Array.from(document.querySelectorAll(listSelector));
  const results = [];

  lists.forEach(list => {
    Array.from(list.children).forEach(child => {
      const selectors = [];
      let element = child;

      // Traverse up to gather the CSS selector for the element
      while (element && element !== document) {
        let selector = element.nodeName.toLowerCase();
        if (element.id) {
          selector += `#${element.id}`;
          selectors.push(selector);
          break;
        }
        // Read the attribute, not `className`: on SVG elements `className`
        // is an SVGAnimatedString and has no `trim()`.
        const className = (element.getAttribute('class') || '').trim().split(/\s+/).join('.');
        if (className) {
          selector += `.${className}`;
        }
        selectors.push(selector);
        element = element.parentElement;
      }

      // `innerText` is undefined on non-HTML elements such as SVG nodes.
      const text = child.innerText !== undefined ? child.innerText : (child.textContent || '');
      results.push({
        selector: selectors.reverse().join(' > '),
        innerText: text.trim()
      });
    });
  });
  return results;
};
/**
 * Repeatedly jumps to the bottom of the page until at least `limit` elements
 * match `selector`, or until the document stops growing.
 *
 * @param {string} selector CSS selector of the items being lazy-loaded.
 * @param {number} limit Number of matching items to wait for.
 * @returns {Promise<void>} Resolves once scrolling has stopped.
 */
window.scrollDown = async function (selector, limit) {
  let previousHeight = 0;
  let itemsLoaded = 0;
  while (itemsLoaded < limit) {
    window.scrollTo(0, document.body.scrollHeight);
    // Give lazily loaded content a moment to be appended.
    await new Promise(resolve => setTimeout(resolve, 1000));
    const currentHeight = document.body.scrollHeight;
    if (currentHeight === previousHeight) {
      break; // No more items to load
    }
    previousHeight = currentHeight;
    // querySelectorAll already returns every item currently in the DOM, so
    // the count is assigned; accumulating it would count items repeatedly.
    itemsLoaded = document.querySelectorAll(selector).length;
  }
};
/**
 * Scrolls up one viewport at a time until at least `limit` elements match
 * `selector`, or until the document height stops changing.
 *
 * @param {string} selector CSS selector of the items being lazy-loaded.
 * @param {number} limit Number of matching items to wait for.
 * @returns {Promise<void>} Resolves once scrolling has stopped.
 */
window.scrollUp = async function (selector, limit) {
  let previousHeight = 0;
  let itemsLoaded = 0;
  while (itemsLoaded < limit) {
    window.scrollBy(0, -window.innerHeight);
    // Give lazily loaded content a moment to be inserted.
    await new Promise(resolve => setTimeout(resolve, 1000));
    const currentHeight = document.body.scrollHeight;
    if (currentHeight === previousHeight) {
      break; // No more items to load
    }
    previousHeight = currentHeight;
    // querySelectorAll already returns every item currently in the DOM, so
    // the count is assigned; accumulating it would count items repeatedly.
    itemsLoaded = document.querySelectorAll(selector).length;
  }
};
})(window); })(window);

View File

@@ -1,5 +1,7 @@
/* eslint-disable no-await-in-loop, no-restricted-syntax */ /* eslint-disable no-await-in-loop, no-restricted-syntax */
import { Page, PageScreenshotOptions } from 'playwright'; import { Page, PageScreenshotOptions } from 'playwright';
import { PlaywrightBlocker } from '@cliqz/adblocker-playwright';
import fetch from 'cross-fetch';
import path from 'path'; import path from 'path';
import { EventEmitter } from 'events'; import { EventEmitter } from 'events';
@@ -29,6 +31,7 @@ interface InterpreterOptions {
}> }>
} }
/** /**
* Class for running the Smart Workflows. * Class for running the Smart Workflows.
*/ */
@@ -45,6 +48,8 @@ export default class Interpreter extends EventEmitter {
private log: typeof log; private log: typeof log;
private blocker: PlaywrightBlocker | null = null;
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>) { constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>) {
super(); super();
this.workflow = workflow.workflow; this.workflow = workflow.workflow;
@@ -76,6 +81,24 @@ export default class Interpreter extends EventEmitter {
oldLog(...args); oldLog(...args);
}; };
} }
PlaywrightBlocker.fromPrebuiltAdsAndTracking(fetch).then(blocker => {
this.blocker = blocker;
}).catch(err => {
this.log(`Failed to initialize ad-blocker:`, Level.ERROR);
})
}
/**
 * Enables the ad-blocker on the given page.
 * Does nothing while `this.blocker` is still `null`.
 * @param page The page to enable blocking in.
 */
private async applyAdBlocker(page: Page): Promise<void> {
  if (!this.blocker) {
    return;
  }
  await this.blocker.enableBlockingInPage(page);
}
/**
 * Disables the ad-blocker on the given page.
 * Does nothing while `this.blocker` is still `null`.
 * @param page The page to disable blocking in.
 */
private async disableAdBlocker(page: Page): Promise<void> {
  if (!this.blocker) {
    return;
  }
  await this.blocker.disableBlockingInPage(page);
}
/** /**
@@ -285,11 +308,32 @@ export default class Interpreter extends EventEmitter {
scrapeSchema: async (schema: Record<string, { selector: string; tag: string, attribute: string; }>) => { scrapeSchema: async (schema: Record<string, { selector: string; tag: string, attribute: string; }>) => {
await this.ensureScriptsLoaded(page); await this.ensureScriptsLoaded(page);
const scrapeResult = await page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema); const scrapeResult = await page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);
await this.options.serializableCallback(scrapeResult); await this.options.serializableCallback(scrapeResult);
}, },
// Scrapes a list of items, either from the current page only or across
// several pages when `config.pagination` is set, and hands the records to
// the serializable callback.
scrapeList: async (config: { listSelector: string, fields: any, limit?: number, pagination: any }) => {
  await this.ensureScriptsLoaded(page);

  const scrapeResults: Record<string, any>[] = config.pagination
    ? await this.handlePagination(page, config)
    : await page.evaluate((cfg) => window.scrapeList(cfg), config);

  await this.options.serializableCallback(scrapeResults);
},
// Runs the browser-side `scrapeListAuto` for `config.listSelector` and hands
// the resulting selector/text pairs to the serializable callback.
scrapeListAuto: async (config: { listSelector: string }) => {
  await this.ensureScriptsLoaded(page);

  const found: { selector: string, innerText: string }[] = await page.evaluate(
    (sel) => window.scrapeListAuto(sel),
    config.listSelector,
  );

  await this.options.serializableCallback(found);
},
scroll: async (pages?: number) => { scroll: async (pages?: number) => {
await page.evaluate(async (pagesInternal) => { await page.evaluate(async (pagesInternal) => {
for (let i = 1; i <= (pagesInternal ?? 1); i += 1) { for (let i = 1; i <= (pagesInternal ?? 1); i += 1) {
@@ -298,6 +342,7 @@ export default class Interpreter extends EventEmitter {
} }
}, pages ?? 1); }, pages ?? 1);
}, },
script: async (code: string) => { script: async (code: string) => {
const AsyncFunction: FunctionConstructor = Object.getPrototypeOf( const AsyncFunction: FunctionConstructor = Object.getPrototypeOf(
async () => { }, async () => { },
@@ -305,6 +350,7 @@ export default class Interpreter extends EventEmitter {
const x = new AsyncFunction('page', 'log', code); const x = new AsyncFunction('page', 'log', code);
await x(page, this.log); await x(page, this.log);
}, },
flag: async () => new Promise((res) => { flag: async () => new Promise((res) => {
this.emit('flag', page, res); this.emit('flag', page, res);
}), }),
@@ -338,7 +384,82 @@ export default class Interpreter extends EventEmitter {
} }
} }
/**
 * Scrapes a list whose items are spread over several "pages", using the
 * strategy named by `config.pagination.type`:
 *
 * - `scrollDown` / `scrollUp`: scroll to the bottom / top until the document
 *   height stops changing, then scrape once.
 * - `clickNext`: scrape, click `config.pagination.selector`, wait for the
 *   navigation, repeat; records already seen are dropped.
 * - `clickLoadMore`: click `config.pagination.selector` until it disappears,
 *   the list stops growing, or `config.limit` items are present, then scrape once.
 * - anything else: scrape the current page once.
 *
 * @param page The page to scrape.
 * @param config The `scrapeList` configuration, including `pagination`.
 * @returns The collected records.
 */
private async handlePagination(page: Page, config: { listSelector: string, fields: any, limit?: number, pagination: any }) {
  let allResults: Record<string, any>[] = [];
  let previousHeight = 0;
  // track unique items per page to avoid re-scraping
  const scrapedItems: Set<string> = new Set<string>();

  const scrapeCurrentPage = () => page.evaluate((cfg) => window.scrapeList(cfg), config);
  const countListItems = (): Promise<number> => page.evaluate(
    (sel: string) => document.querySelectorAll(sel).length,
    config.listSelector,
  );

  while (true) {
    // Each case is wrapped in braces so its `const`s stay scoped to that case.
    switch (config.pagination.type) {
      case 'scrollDown': {
        await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
        await page.waitForTimeout(2000);

        const currentHeight = await page.evaluate(() => document.body.scrollHeight);
        if (currentHeight === previousHeight) {
          // Nothing new was loaded: everything is in the DOM, scrape it.
          const finalResults = await scrapeCurrentPage();
          allResults = allResults.concat(finalResults);
          return allResults;
        }

        previousHeight = currentHeight;
        break;
      }
      case 'scrollUp': {
        // Mirror of `scrollDown`. Without any work here the surrounding
        // loop would never terminate.
        await page.evaluate(() => window.scrollTo(0, 0));
        await page.waitForTimeout(2000);

        const currentHeight = await page.evaluate(() => document.body.scrollHeight);
        if (currentHeight === previousHeight) {
          const finalResults = await scrapeCurrentPage();
          allResults = allResults.concat(finalResults);
          return allResults;
        }

        previousHeight = currentHeight;
        break;
      }
      case 'clickNext': {
        const pageResults = await scrapeCurrentPage();

        // Filter out already scraped items
        const newResults = pageResults.filter(item => {
          const uniqueKey = JSON.stringify(item);
          if (scrapedItems.has(uniqueKey)) return false; // Ignore if already scraped
          scrapedItems.add(uniqueKey); // Mark as scraped
          return true;
        });

        allResults = allResults.concat(newResults);

        if (config.limit && allResults.length >= config.limit) {
          return allResults.slice(0, config.limit);
        }

        const nextButton = await page.$(config.pagination.selector);
        if (!nextButton) {
          return allResults; // No more pages to scrape
        }
        await Promise.all([
          nextButton.click(),
          page.waitForNavigation({ waitUntil: 'networkidle' })
        ]);

        await page.waitForTimeout(1000);
        break;
      }
      case 'clickLoadMore': {
        const loadMoreButton = await page.$(config.pagination.selector);
        const itemsBefore = await countListItems();
        const limit = config.limit ?? 0;
        const limitReached = limit > 0 && itemsBefore >= limit;

        if (!loadMoreButton || limitReached) {
          // Items accumulate in the same document, so one scrape collects them.
          const finalResults = await scrapeCurrentPage();
          allResults = allResults.concat(finalResults);
          return allResults;
        }

        await loadMoreButton.click();
        await page.waitForTimeout(2000);

        // Stop when the click loaded nothing new, otherwise a button that
        // never disappears would keep this loop running forever.
        const itemsAfter = await countListItems();
        if (itemsAfter <= itemsBefore) {
          const finalResults = await scrapeCurrentPage();
          allResults = allResults.concat(finalResults);
          return allResults;
        }
        break;
      }
      default: {
        const results = await scrapeCurrentPage();
        allResults = allResults.concat(results);
        return allResults;
      }
    }

    if (config.limit && allResults.length >= config.limit) {
      allResults = allResults.slice(0, config.limit);
      break;
    }
  }

  return allResults;
}
private async runLoop(p: Page, workflow: Workflow) { private async runLoop(p: Page, workflow: Workflow) {
// apply ad-blocker to the current page
await this.applyAdBlocker(p);
const usedActions: string[] = []; const usedActions: string[] = [];
let lastAction = null; let lastAction = null;
let repeatCount = 0; let repeatCount = 0;
@@ -404,13 +525,14 @@ export default class Interpreter extends EventEmitter {
this.log(<Error>e, Level.ERROR); this.log(<Error>e, Level.ERROR);
} }
} else { } else {
//await this.disableAdBlocker(p);
return; return;
} }
} }
} }
private async ensureScriptsLoaded(page: Page) { private async ensureScriptsLoaded(page: Page) {
const isScriptLoaded = await page.evaluate(() => typeof window.scrape === 'function' && typeof window.scrapeSchema === 'function'); const isScriptLoaded = await page.evaluate(() => typeof window.scrape === 'function' && typeof window.scrapeSchema === 'function' && typeof window.scrapeList === 'function' && typeof window.scrapeListAuto === 'function' && typeof window.scrollDown === 'function' && typeof window.scrollUp === 'function');
if (!isScriptLoaded) { if (!isScriptLoaded) {
await page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') }); await page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
} }

View File

@@ -46,11 +46,11 @@ export default class Preprocessor {
return error; return error;
} }
/** /**
* Extracts parameter names from the workflow. * Extracts parameter names from the workflow.
* @param {WorkflowFile} workflow The given workflow * @param {WorkflowFile} workflow The given workflow
* @returns {String[]} List of parameters' names. * @returns {String[]} List of parameters' names.
*/ */
static getParams(workflow: WorkflowFile): string[] { static getParams(workflow: WorkflowFile): string[] {
const getParamsRecurse = (object: any): string[] => { const getParamsRecurse = (object: any): string[] => {
if (typeof object === 'object') { if (typeof object === 'object') {
@@ -69,10 +69,10 @@ export default class Preprocessor {
return getParamsRecurse(workflow.workflow); return getParamsRecurse(workflow.workflow);
} }
/** /**
* List all the selectors used in the given workflow (only literal "selector" * List all the selectors used in the given workflow (only literal "selector"
* field in WHERE clauses so far) * field in WHERE clauses so far)
*/ */
// TODO : add recursive selector search (also in click/fill etc. events?) // TODO : add recursive selector search (also in click/fill etc. events?)
static extractSelectors(workflow: Workflow): SelectorArray { static extractSelectors(workflow: Workflow): SelectorArray {
/** /**
@@ -107,11 +107,11 @@ export default class Preprocessor {
], []); ], []);
} }
/** /**
* Recursively crawl `object` and initializes params - replaces the `{$param : paramName}` objects * Recursively crawl `object` and initializes params - replaces the `{$param : paramName}` objects
* with the defined value. * with the defined value.
* @returns {Workflow} Copy of the given workflow, modified (the initial workflow is left untouched). * @returns {Workflow} Copy of the given workflow, modified (the initial workflow is left untouched).
*/ */
static initWorkflow(workflow: Workflow, params?: ParamType): Workflow { static initWorkflow(workflow: Workflow, params?: ParamType): Workflow {
const paramNames = this.getParams({ workflow }); const paramNames = this.getParams({ workflow });

View File

@@ -28,7 +28,7 @@ type MethodNames<T> = {
[K in keyof T]: T[K] extends Function ? K : never; [K in keyof T]: T[K] extends Function ? K : never;
}[keyof T]; }[keyof T];
export type CustomFunctions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag'; export type CustomFunctions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag' | 'scrapeList' | 'scrapeListAuto';
export type What = { export type What = {
action: MethodNames<Page> | CustomFunctions, action: MethodNames<Page> | CustomFunctions,

View File

@@ -7,6 +7,7 @@ import {
getElementInformation, getElementInformation,
getRect, getRect,
getSelectors, getSelectors,
getNonUniqueSelectors,
isRuleOvershadowing, isRuleOvershadowing,
selectorAlreadyInWorkflow selectorAlreadyInWorkflow
} from "../selector"; } from "../selector";
@@ -46,6 +47,12 @@ export class WorkflowGenerator {
*/ */
private socket: Socket; private socket: Socket;
/**
* getList is one of the custom actions from maxun-core.
* Used to provide appropriate selectors for the getList action.
*/
private getList: boolean = false;
/** /**
* The public constructor of the WorkflowGenerator. * The public constructor of the WorkflowGenerator.
* Takes socket for communication as a parameter and registers some important events on it. * Takes socket for communication as a parameter and registers some important events on it.
@@ -55,6 +62,7 @@ export class WorkflowGenerator {
public constructor(socket: Socket) { public constructor(socket: Socket) {
this.socket = socket; this.socket = socket;
this.registerEventHandlers(socket); this.registerEventHandlers(socket);
this.initializeSocketListeners();
} }
/** /**
@@ -88,6 +96,15 @@ export class WorkflowGenerator {
lastAction: '', lastAction: '',
} }
/**
 * Subscribes to the `setGetList` socket event and mirrors its `getList`
 * payload into `this.getList`.
 */
private initializeSocketListeners() {
  this.socket.on('setGetList', ({ getList }: { getList: boolean }) => {
    this.getList = getList;
  });
}
/** /**
* Registers the event handlers for all generator-related events on the socket. * Registers the event handlers for all generator-related events on the socket.
* @param socket The socket used to communicate with the client. * @param socket The socket used to communicate with the client.
@@ -459,13 +476,17 @@ export class WorkflowGenerator {
*/ */
private generateSelector = async (page: Page, coordinates: Coordinates, action: ActionType) => { private generateSelector = async (page: Page, coordinates: Coordinates, action: ActionType) => {
const elementInfo = await getElementInformation(page, coordinates); const elementInfo = await getElementInformation(page, coordinates);
const selectorBasedOnCustomAction = (this.getList === true)
? await getNonUniqueSelectors(page, coordinates)
: await getSelectors(page, coordinates);
const bestSelector = getBestSelectorForAction( const bestSelector = getBestSelectorForAction(
{ {
type: action, type: action,
tagName: elementInfo?.tagName as TagName || '', tagName: elementInfo?.tagName as TagName || '',
inputType: undefined, inputType: undefined,
value: undefined, value: undefined,
selectors: await getSelectors(page, coordinates) || {}, selectors: selectorBasedOnCustomAction || {},
timestamp: 0, timestamp: 0,
isPassword: false, isPassword: false,
hasOnlyText: elementInfo?.hasOnlyText || false, hasOnlyText: elementInfo?.hasOnlyText || false,
@@ -488,6 +509,8 @@ export class WorkflowGenerator {
if (rect) { if (rect) {
this.socket.emit('highlighter', { rect, selector: displaySelector, elementInfo }); this.socket.emit('highlighter', { rect, selector: displaySelector, elementInfo });
} }
// reset getList after usage
this.getList = false;
} }
/** /**

View File

@@ -98,20 +98,20 @@ export const getElementInformation = async (
{ x: coordinates.x, y: coordinates.y }, { x: coordinates.x, y: coordinates.y },
); );
if (elementInfo) { // if (elementInfo) {
if (elementInfo.tagName === 'A') { // if (elementInfo.tagName === 'A') {
if (elementInfo.innerText) { // if (elementInfo.innerText) {
console.log(`Link text: ${elementInfo.innerText}, URL: ${elementInfo.url}`); // console.log(`Link text: ${elementInfo.innerText}, URL: ${elementInfo.url}`);
} else { // } else {
console.log(`URL: ${elementInfo.url}`); // console.log(`URL: ${elementInfo.url}`);
} // }
} else if (elementInfo.tagName === 'IMG') { // } else if (elementInfo.tagName === 'IMG') {
console.log(`Image URL: ${elementInfo.imageUrl}`); // console.log(`Image URL: ${elementInfo.imageUrl}`);
} else { // } else {
console.log(`Element innerText: ${elementInfo.innerText}`); // console.log(`Element innerText: ${elementInfo.innerText}`);
} // }
} // }
return elementInfo; return elementInfo;
} catch (error) { } catch (error) {
const { message, stack } = error as Error; const { message, stack } = error as Error;
@@ -721,6 +721,66 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => {
return null; return null;
}; };
/**
 * Builds a deliberately non-unique CSS selector for the element found at the
 * given coordinates, so that the selector also matches sibling items of a list.
 *
 * The selector is the ` > `-joined chain of `tag.class1.class2` segments from
 * just below `document.body` down to the element. IDs are never used, and
 * classes starting with `!` or containing `:` are left out.
 *
 * @param page The page instance.
 * @param coordinates Coordinates of an element.
 * @category WorkflowManagement-Selectors
 * @returns `{ generalSelector }` for the element, or an empty object when no
 *   element is found at the coordinates or the evaluation fails.
 */
export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates) => {
  try {
    const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => {
      function getNonUniqueSelector(element: HTMLElement): string {
        let selector = element.tagName.toLowerCase();

        // Avoid using IDs to maintain non-uniqueness.
        // Read the attribute, not `className`: on SVG elements `className` is
        // an SVGAnimatedString, which has no `split()`.
        const classAttribute = element.getAttribute('class') || '';
        if (classAttribute) {
          const classes = classAttribute.split(/\s+/).filter((cls: string) => Boolean(cls));
          if (classes.length > 0) {
            // Exclude utility classes and escape special characters
            const validClasses = classes.filter((cls: string) => !cls.startsWith('!') && !cls.includes(':'));
            if (validClasses.length > 0) {
              selector += '.' + validClasses.map(cls => CSS.escape(cls)).join('.');
            }
          }
        }

        return selector;
      }

      function getSelectorPath(element: HTMLElement | null): string {
        const path: string[] = [];

        // Walk up to, but not including, <body>.
        while (element && element !== document.body) {
          const selector = getNonUniqueSelector(element);
          path.unshift(selector);
          element = element.parentElement;
        }

        return path.join(' > ');
      }

      const element = document.elementFromPoint(x, y) as HTMLElement | null;
      if (!element) return null;

      const generalSelector = getSelectorPath(element);
      return {
        generalSelector,
      };
    }, coordinates);

    return selectors || {};
  } catch (error) {
    console.error('Error in getNonUniqueSelectors:', error);
    return {};
  }
};
/** /**
* Returns the first pair from the given workflow that contains the given selector * Returns the first pair from the given workflow that contains the given selector
* inside the where condition, and it is the only selector there. * inside the where condition, and it is the only selector there.

View File

@@ -24,10 +24,6 @@ export const Highlighter = ({ unmodifiedRect, displayedSelector = '', width, hei
}; };
//console.log('unmodifiedRect:', unmodifiedRect)
//console.log('rectangle:', rect)
//console.log('canvas rectangle:', canvasRect)
return ( return (
<div> <div>
<HighlighterOutline <HighlighterOutline
@@ -54,7 +50,7 @@ const HighlighterOutline = styled.div<HighlighterOutlineProps>`
pointer-events: none !important; pointer-events: none !important;
position: fixed !important; position: fixed !important;
background: #ff5d5b26 !important; background: #ff5d5b26 !important;
outline: 4px solid pink !important; outline: 4px solid red !important;
//border: 4px solid #ff5d5b !important; //border: 4px solid #ff5d5b !important;
z-index: 2147483647 !important; z-index: 2147483647 !important;
//border-radius: 5px; //border-radius: 5px;

View File

@@ -27,8 +27,9 @@ const Canvas = ({ width, height, onCreateRef }: CanvasProps) => {
const canvasRef = useRef<HTMLCanvasElement>(null); const canvasRef = useRef<HTMLCanvasElement>(null);
const { socket } = useSocketStore(); const { socket } = useSocketStore();
const { setLastAction, lastAction } = useGlobalInfoStore(); const { setLastAction, lastAction } = useGlobalInfoStore();
const { getText, getScreenshot } = useActionContext(); const { getText, getList } = useActionContext();
const getTextRef = useRef(getText); const getTextRef = useRef(getText);
const getListRef = useRef(getList);
const notifyLastAction = (action: string) => { const notifyLastAction = (action: string) => {
if (lastAction !== action) { if (lastAction !== action) {
@@ -40,7 +41,8 @@ const Canvas = ({ width, height, onCreateRef }: CanvasProps) => {
useEffect(() => { useEffect(() => {
getTextRef.current = getText; getTextRef.current = getText;
}, [getText]); getListRef.current = getList;
}, [getText, getList]);
const onMouseEvent = useCallback((event: MouseEvent) => { const onMouseEvent = useCallback((event: MouseEvent) => {
if (socket) { if (socket) {
@@ -51,8 +53,9 @@ const Canvas = ({ width, height, onCreateRef }: CanvasProps) => {
switch (event.type) { switch (event.type) {
case 'mousedown': case 'mousedown':
const clickCoordinates = getMappedCoordinates(event, canvasRef.current, width, height); const clickCoordinates = getMappedCoordinates(event, canvasRef.current, width, height);
if (getTextRef.current === true) { if (getTextRef.current === true || getListRef.current === true) {
console.log('get text') // todo: remove console.log and return
console.log('get text or get list is true');
} else { } else {
socket.emit('input:mousedown', clickCoordinates); socket.emit('input:mousedown', clickCoordinates);
} }

View File

@@ -27,7 +27,7 @@ export const InterpretationLog = () => {
} }
const handleLog = useCallback((msg: string, date: boolean = true) => { const handleLog = useCallback((msg: string, date: boolean = true) => {
if (!date){ if (!date) {
setLog((prevState) => prevState + '\n' + msg); setLog((prevState) => prevState + '\n' + msg);
} else { } else {
setLog((prevState) => prevState + '\n' + `[${new Date().toLocaleString()}] ` + msg); setLog((prevState) => prevState + '\n' + `[${new Date().toLocaleString()}] ` + msg);
@@ -42,9 +42,9 @@ export const InterpretationLog = () => {
scrollLogToBottom(); scrollLogToBottom();
}, [log, scrollLogToBottom]) }, [log, scrollLogToBottom])
const handleBinaryCallback = useCallback(({data, mimetype}: any) => { const handleBinaryCallback = useCallback(({ data, mimetype }: any) => {
setLog((prevState) => setLog((prevState) =>
prevState + '\n' + '---------- Binary output data received ----------' + '\n' prevState + '\n' + '---------- Binary output data received ----------' + '\n'
+ `mimetype: ${mimetype}` + '\n' + `data: ${JSON.stringify(data)}` + '\n' + `mimetype: ${mimetype}` + '\n' + `data: ${JSON.stringify(data)}` + '\n'
+ '------------------------------------------------'); + '------------------------------------------------');
scrollLogToBottom(); scrollLogToBottom();
@@ -66,10 +66,10 @@ export const InterpretationLog = () => {
<Accordion <Accordion
expanded={expanded} expanded={expanded}
onChange={handleChange(!expanded)} onChange={handleChange(!expanded)}
style={{background: '#3f4853', color: 'white', borderRadius: '0px'}} style={{ background: '#3f4853', color: 'white', borderRadius: '0px' }}
> >
<AccordionSummary <AccordionSummary
expandIcon={<ExpandMoreIcon sx={{color: 'white'}}/>} expandIcon={<ExpandMoreIcon sx={{ color: 'white' }} />}
aria-controls="panel1bh-content" aria-controls="panel1bh-content"
id="panel1bh-header" id="panel1bh-header"
> >
@@ -88,8 +88,8 @@ export const InterpretationLog = () => {
<Highlight className="javascript"> <Highlight className="javascript">
{log} {log}
</Highlight> </Highlight>
<div style={{ float:"left", clear: "both" }} <div style={{ float: "left", clear: "both" }}
ref={logEndRef}/> ref={logEndRef} />
</div> </div>
</AccordionDetails> </AccordionDetails>
</Accordion> </Accordion>

View File

@@ -5,7 +5,7 @@ import { useBrowserDimensionsStore } from "../../context/browserDimensions";
import { Highlighter } from "../atoms/Highlighter"; import { Highlighter } from "../atoms/Highlighter";
import { GenericModal } from '../atoms/GenericModal'; import { GenericModal } from '../atoms/GenericModal';
import { useActionContext } from '../../context/browserActions'; import { useActionContext } from '../../context/browserActions';
import { useBrowserSteps } from '../../context/browserSteps'; import { useBrowserSteps, TextStep } from '../../context/browserSteps';
interface ElementInfo { interface ElementInfo {
tagName: string; tagName: string;
@@ -45,10 +45,13 @@ export const BrowserWindow = () => {
const [attributeOptions, setAttributeOptions] = useState<AttributeOption[]>([]); const [attributeOptions, setAttributeOptions] = useState<AttributeOption[]>([]);
const [selectedElement, setSelectedElement] = useState<{ selector: string, info: ElementInfo | null } | null>(null); const [selectedElement, setSelectedElement] = useState<{ selector: string, info: ElementInfo | null } | null>(null);
const [listSelector, setListSelector] = useState<string | null>(null);
const [fields, setFields] = useState<Record<string, TextStep>>({});
const { socket } = useSocketStore(); const { socket } = useSocketStore();
const { width, height } = useBrowserDimensionsStore(); const { width, height } = useBrowserDimensionsStore();
const { getText } = useActionContext(); const { getText, getList } = useActionContext();
const { addTextStep } = useBrowserSteps(); const { addTextStep, addListStep } = useBrowserSteps();
const onMouseMove = (e: MouseEvent) => { const onMouseMove = (e: MouseEvent) => {
if (canvasRef && canvasRef.current && highlighterData) { if (canvasRef && canvasRef.current && highlighterData) {
@@ -84,8 +87,11 @@ export const BrowserWindow = () => {
}, [screenShot, canvasRef, socket, screencastHandler]); }, [screenShot, canvasRef, socket, screencastHandler]);
const highlighterHandler = useCallback((data: { rect: DOMRect, selector: string, elementInfo: ElementInfo | null }) => { const highlighterHandler = useCallback((data: { rect: DOMRect, selector: string, elementInfo: ElementInfo | null }) => {
if (getList === true) {
socket?.emit('setGetList', { getList: true });
}
setHighlighterData(data); setHighlighterData(data);
}, [highlighterData]) }, [highlighterData, getList, socket]);
useEffect(() => { useEffect(() => {
document.addEventListener('mousemove', onMouseMove, false); document.addEventListener('mousemove', onMouseMove, false);
@@ -128,6 +134,45 @@ export const BrowserWindow = () => {
}); });
} }
} }
if (getList === true && !listSelector) {
setListSelector(highlighterData.selector);
} else if (getList === true && listSelector) {
const options = getAttributeOptions(highlighterData.elementInfo?.tagName || '');
if (options.length > 1) {
setAttributeOptions(options);
setSelectedElement({
selector: highlighterData.selector,
info: highlighterData.elementInfo
});
setShowAttributeModal(true);
} else {
const newField: TextStep = {
id: Date.now(),
type: 'text',
label: `Label ${Object.keys(fields).length + 1}`,
data: highlighterData.elementInfo?.innerText || '',
selectorObj: {
selector: highlighterData.selector,
tag: highlighterData.elementInfo?.tagName,
attribute: 'innerText'
}
};
setFields(prevFields => {
const updatedFields = {
...prevFields,
[newField.label]: newField
};
return updatedFields;
});
if (listSelector) {
addListStep(listSelector, { ...fields, [newField.label]: newField });
}
}
}
} }
} }
}; };
@@ -153,6 +198,31 @@ export const BrowserWindow = () => {
attribute: attribute attribute: attribute
}); });
} }
if (getList === true) {
const newField: TextStep = {
id: Date.now(),
type: 'text',
label: `Label ${Object.keys(fields).length + 1}`,
data: selectedElement.info?.innerText || '',
selectorObj: {
selector: selectedElement.selector,
tag: selectedElement.info?.tagName,
attribute: attribute
}
};
setFields(prevFields => {
const updatedFields = {
...prevFields,
[newField.label]: newField
};
return updatedFields;
});
if (listSelector) {
addListStep(listSelector, { ...fields, [newField.label]: newField });
}
}
} }
} }
setShowAttributeModal(false); setShowAttributeModal(false);
@@ -161,7 +231,7 @@ export const BrowserWindow = () => {
return ( return (
<div onClick={handleClick}> <div onClick={handleClick}>
{ {
getText === true ? ( getText === true || getList === true ? (
<GenericModal <GenericModal
isOpen={showAttributeModal} isOpen={showAttributeModal}
onClose={() => { }} onClose={() => { }}
@@ -179,7 +249,7 @@ export const BrowserWindow = () => {
</GenericModal> </GenericModal>
) : null ) : null
} }
{(getText === true && !showAttributeModal && highlighterData?.rect != null && highlighterData?.rect.top != null) && canvasRef?.current ? {((getText === true || getList === true) && !showAttributeModal && highlighterData?.rect != null && highlighterData?.rect.top != null) && canvasRef?.current ?
<Highlighter <Highlighter
unmodifiedRect={highlighterData?.rect} unmodifiedRect={highlighterData?.rect}
displayedSelector={highlighterData?.selector} displayedSelector={highlighterData?.selector}

View File

@@ -8,11 +8,16 @@ import { SimpleBox } from "../atoms/Box";
import Typography from "@mui/material/Typography"; import Typography from "@mui/material/Typography";
import { useGlobalInfoStore } from "../../context/globalInfo"; import { useGlobalInfoStore } from "../../context/globalInfo";
import { useActionContext } from '../../context/browserActions'; import { useActionContext } from '../../context/browserActions';
import { useBrowserSteps } from '../../context/browserSteps'; import { useBrowserSteps, ListStep, TextStep, SelectorObject } from '../../context/browserSteps';
import { useSocketStore } from '../../context/socket'; import { useSocketStore } from '../../context/socket';
import { ScreenshotSettings } from '../../shared/types'; import { ScreenshotSettings } from '../../shared/types';
import InputAdornment from '@mui/material/InputAdornment'; import InputAdornment from '@mui/material/InputAdornment';
// TODO:
// 1. Handle field label update
// 2. Handle field deletion | confirmation
// 3. Add description for each browser step
// 4. Handle non custom action steps
export const RightSidePanel = () => { export const RightSidePanel = () => {
const [textLabels, setTextLabels] = useState<{ [id: number]: string }>({}); const [textLabels, setTextLabels] = useState<{ [id: number]: string }>({});
@@ -20,7 +25,7 @@ export const RightSidePanel = () => {
const [confirmedTextSteps, setConfirmedTextSteps] = useState<{ [id: number]: boolean }>({}); const [confirmedTextSteps, setConfirmedTextSteps] = useState<{ [id: number]: boolean }>({});
const { lastAction, notify } = useGlobalInfoStore(); const { lastAction, notify } = useGlobalInfoStore();
const { getText, startGetText, stopGetText, getScreenshot, startGetScreenshot, stopGetScreenshot } = useActionContext(); const { getText, startGetText, stopGetText, getScreenshot, startGetScreenshot, stopGetScreenshot, getList, startGetList, stopGetList } = useActionContext();
const { browserSteps, updateBrowserTextStepLabel, deleteBrowserStep, addScreenshotStep } = useBrowserSteps(); const { browserSteps, updateBrowserTextStepLabel, deleteBrowserStep, addScreenshotStep } = useBrowserSteps();
const { socket } = useSocketStore(); const { socket } = useSocketStore();
@@ -80,6 +85,49 @@ export const RightSidePanel = () => {
} }
}, [stopGetText, getTextSettingsObject, socket, browserSteps, confirmedTextSteps]); }, [stopGetText, getTextSettingsObject, socket, browserSteps, confirmedTextSteps]);
/**
 * Derives the `scrapeList` settings from the recorded browser steps.
 *
 * Only steps of type 'list' that have a list selector and at least one field are
 * considered. For each such step, every field that carries a selector is reduced to
 * its `selector`, `tag` and `attribute`. If several list steps qualify, the last one
 * in `browserSteps` wins. When none qualifies an empty object is returned.
 */
const getListSettingsObject = useCallback(() => {
    type FieldSettings = Record<string, { selector: string; tag?: string;[key: string]: any }>;

    let settings: { listSelector?: string; fields?: FieldSettings } = {};

    for (const step of browserSteps) {
        // Skip anything that is not a list step with a selector and some fields.
        if (step.type !== 'list' || !step.listSelector || Object.keys(step.fields).length === 0) {
            continue;
        }

        const fields: FieldSettings = {};
        for (const [label, field] of Object.entries(step.fields)) {
            const selectorObj = field.selectorObj;
            // Fields without a selector cannot be scraped, so they are left out.
            if (!selectorObj?.selector) {
                continue;
            }
            fields[label] = {
                selector: selectorObj.selector,
                tag: selectorObj.tag,
                attribute: selectorObj.attribute
            };
        }

        // A later qualifying list step replaces the settings of an earlier one.
        settings = { listSelector: step.listSelector, fields };
    }

    return settings;
}, [browserSteps]);
/**
 * Leaves list-capture mode and, when a usable list definition exists, emits it over
 * the socket as a `scrapeList` action.
 *
 * `getListSettingsObject` always returns an object (an empty one when no list step
 * with fields was recorded), so a plain truthiness check on the result can never
 * fail. The settings are therefore validated explicitly: a list selector and at
 * least one field are required; otherwise the user is notified and nothing is sent.
 */
const stopCaptureAndEmitGetListSettings = useCallback(() => {
    stopGetList();
    const settings = getListSettingsObject();
    const hasFields = settings.fields !== undefined && Object.keys(settings.fields).length > 0;
    if (settings.listSelector && hasFields) {
        socket?.emit('action', { action: 'scrapeList', settings });
    } else {
        notify('error', 'Unable to create list settings. Make sure you have defined a field for the list.');
    }
}, [stopGetList, getListSettingsObject, socket, notify]);
// const handleListFieldChange = (stepId: number, key: 'label' | 'data', value: string) => {
// updateListStepField(stepId, key, value);
// };
const captureScreenshot = (fullPage: boolean) => { const captureScreenshot = (fullPage: boolean) => {
const screenshotSettings: ScreenshotSettings = { const screenshotSettings: ScreenshotSettings = {
fullPage, fullPage,
@@ -101,7 +149,17 @@ export const RightSidePanel = () => {
</SimpleBox> </SimpleBox>
<Box display="flex" flexDirection="column" gap={2} style={{ margin: '15px' }}> <Box display="flex" flexDirection="column" gap={2} style={{ margin: '15px' }}>
{!getText && !getScreenshot && <Button variant="contained" onClick={startGetText}>Capture Text</Button>} {!getText && !getScreenshot && !getList && <Button variant="contained" onClick={startGetList}>Capture List</Button>}
{getList &&
<>
<Box display="flex" justifyContent="space-between" gap={2} style={{ margin: '15px' }}>
<Button variant="outlined" onClick={stopCaptureAndEmitGetListSettings}>Confirm</Button>
<Button variant="outlined" color="error" onClick={stopGetList}>Discard</Button>
</Box>
</>
}
{!getText && !getScreenshot && !getList && <Button variant="contained" onClick={startGetText}>Capture Text</Button>}
{getText && {getText &&
<> <>
<Box display="flex" justifyContent="space-between" gap={2} style={{ margin: '15px' }}> <Box display="flex" justifyContent="space-between" gap={2} style={{ margin: '15px' }}>
@@ -111,7 +169,7 @@ export const RightSidePanel = () => {
</> </>
} }
{!getText && !getScreenshot && <Button variant="contained" onClick={startGetScreenshot}>Capture Screenshot</Button>} {!getText && !getScreenshot && !getList && <Button variant="contained" onClick={startGetScreenshot}>Capture Screenshot</Button>}
{getScreenshot && ( {getScreenshot && (
<Box display="flex" flexDirection="column" gap={2}> <Box display="flex" flexDirection="column" gap={2}>
<Button variant="contained" onClick={() => captureScreenshot(true)}>Capture Fullpage</Button> <Button variant="contained" onClick={() => captureScreenshot(true)}>Capture Fullpage</Button>
@@ -125,7 +183,7 @@ export const RightSidePanel = () => {
{browserSteps.map(step => ( {browserSteps.map(step => (
<Box key={step.id} sx={{ boxShadow: 5, padding: '10px', margin: '10px', borderRadius: '4px' }}> <Box key={step.id} sx={{ boxShadow: 5, padding: '10px', margin: '10px', borderRadius: '4px' }}>
{ {
step.type === 'text' ? ( step.type === 'text' && (
<> <>
<TextField <TextField
label="Label" label="Label"
@@ -165,24 +223,55 @@ export const RightSidePanel = () => {
</Box> </Box>
)} )}
</> </>
) : ( )}
step.type === 'screenshot' && ( {step.type === 'screenshot' && (
<Box display="flex" alignItems="center"> <Box display="flex" alignItems="center">
<DocumentScannerIcon sx={{ mr: 1 }} /> <DocumentScannerIcon sx={{ mr: 1 }} />
<Typography> <Typography>
{`Take ${step.fullPage ? 'Fullpage' : 'Visible Part'} Screenshot`} {`Take ${step.fullPage ? 'Fullpage' : 'Visible Part'} Screenshot`}
</Typography> </Typography>
</Box>
)}
{step.type === 'list' && (
<>
<Typography>List Selected Successfully</Typography>
{Object.entries(step.fields).map(([key, field]) => (
<Box key={key}>
<TextField
label="Field Label"
value={field.label || ''}
onChange={() => { }}
fullWidth
margin="normal"
InputProps={{
startAdornment: (
<InputAdornment position="start">
<EditIcon />
</InputAdornment>
)
}}
/>
<TextField
label="Field Data"
value={field.data || ''}
fullWidth
margin="normal"
InputProps={{
readOnly: true,
startAdornment: (
<InputAdornment position="start">
<TextFieldsIcon />
</InputAdornment>
)
}}
/>
</Box> </Box>
) ))}
) </>
} )}
</Box> </Box>
))} ))}
</Box> </Box>
</Paper> </Paper>
); );
}; };
export const ActionDescription = styled.p`
margin-left: 15px;
`;

View File

@@ -2,9 +2,12 @@ import React, { createContext, useContext, useState, ReactNode } from 'react';
interface ActionContextProps { interface ActionContextProps {
getText: boolean; getText: boolean;
getList: boolean;
getScreenshot: boolean; getScreenshot: boolean;
startGetText: () => void; startGetText: () => void;
stopGetText: () => void; stopGetText: () => void;
startGetList: () => void;
stopGetList: () => void;
startGetScreenshot: () => void; startGetScreenshot: () => void;
stopGetScreenshot: () => void; stopGetScreenshot: () => void;
} }
@@ -13,16 +16,20 @@ const ActionContext = createContext<ActionContextProps | undefined>(undefined);
export const ActionProvider = ({ children }: { children: ReactNode }) => { export const ActionProvider = ({ children }: { children: ReactNode }) => {
const [getText, setGetText] = useState<boolean>(false); const [getText, setGetText] = useState<boolean>(false);
const [getList, setGetList] = useState<boolean>(false);
const [getScreenshot, setGetScreenshot] = useState<boolean>(false); const [getScreenshot, setGetScreenshot] = useState<boolean>(false);
const startGetText = () => setGetText(true); const startGetText = () => setGetText(true);
const stopGetText = () => setGetText(false); const stopGetText = () => setGetText(false);
const startGetList = () => setGetList(true);
const stopGetList = () => setGetList(false);
const startGetScreenshot = () => setGetScreenshot(true); const startGetScreenshot = () => setGetScreenshot(true);
const stopGetScreenshot = () => setGetScreenshot(false); const stopGetScreenshot = () => setGetScreenshot(false);
return ( return (
<ActionContext.Provider value={{ getText, getScreenshot, startGetText, stopGetText, startGetScreenshot, stopGetScreenshot }}> <ActionContext.Provider value={{ getText, getList, getScreenshot, startGetText, stopGetText, startGetList, stopGetList, startGetScreenshot, stopGetScreenshot }}>
{children} {children}
</ActionContext.Provider> </ActionContext.Provider>
); );
@@ -34,4 +41,4 @@ export const useActionContext = () => {
throw new Error('useActionContext must be used within an ActionProvider'); throw new Error('useActionContext must be used within an ActionProvider');
} }
return context; return context;
}; };

View File

@@ -1,6 +1,6 @@
import React, { createContext, useContext, useState } from 'react'; import React, { createContext, useContext, useState } from 'react';
interface TextStep { export interface TextStep {
id: number; id: number;
type: 'text'; type: 'text';
label: string; label: string;
@@ -14,10 +14,16 @@ interface ScreenshotStep {
fullPage: boolean; fullPage: boolean;
} }
/**
 * A recorded "capture list" step: one list selector together with the fields
 * picked for that list.
 */
export interface ListStep {
    /** Numeric identifier of the step. */
    id: number;
    /** Discriminant that distinguishes list steps from the other browser step kinds. */
    type: 'list';
    /** Selector of the list this step scrapes. */
    listSelector: string;
    /** Fields captured for the list, stored under a string key per field. */
    fields: Record<string, TextStep>;
}
type BrowserStep = TextStep | ScreenshotStep; type BrowserStep = TextStep | ScreenshotStep | ListStep;
interface SelectorObject { export interface SelectorObject {
selector: string; selector: string;
tag?: string; tag?: string;
attribute?: string; attribute?: string;
@@ -27,6 +33,7 @@ interface SelectorObject {
interface BrowserStepsContextType { interface BrowserStepsContextType {
browserSteps: BrowserStep[]; browserSteps: BrowserStep[];
addTextStep: (label: string, data: string, selectorObj: SelectorObject) => void; addTextStep: (label: string, data: string, selectorObj: SelectorObject) => void;
addListStep: (listSelector: string, fields: { [key: string]: TextStep }) => void
addScreenshotStep: (fullPage: boolean) => void; addScreenshotStep: (fullPage: boolean) => void;
deleteBrowserStep: (id: number) => void; deleteBrowserStep: (id: number) => void;
updateBrowserTextStepLabel: (id: number, newLabel: string) => void; updateBrowserTextStepLabel: (id: number, newLabel: string) => void;
@@ -44,6 +51,31 @@ export const BrowserStepsProvider: React.FC<{ children: React.ReactNode }> = ({
]); ]);
}; };
/**
 * Records fields for a list.
 *
 * If a list step with the same `listSelector` already exists, `newFields` are merged
 * into the first such step (entries with the same key are overwritten). Otherwise a
 * new list step is appended with `Date.now()` as its id.
 *
 * @param listSelector Selector identifying the list.
 * @param newFields Fields to add to the list step, keyed by string.
 */
const addListStep = (listSelector: string, newFields: { [key: string]: TextStep }) => {
    setBrowserSteps(prevSteps => {
        const matchIndex = prevSteps.findIndex(
            step => step.type === 'list' && step.listSelector === listSelector
        );

        // No list step for this selector yet: append a fresh one.
        if (matchIndex === -1) {
            return [
                ...prevSteps,
                { id: Date.now(), type: 'list', listSelector, fields: newFields }
            ];
        }

        // Merge into the existing step on a copy so the previous state array is not mutated.
        const current = prevSteps[matchIndex] as ListStep;
        const nextSteps = prevSteps.slice();
        nextSteps[matchIndex] = {
            ...current,
            fields: { ...current.fields, ...newFields }
        };
        return nextSteps;
    });
};
const addScreenshotStep = (fullPage: boolean) => { const addScreenshotStep = (fullPage: boolean) => {
setBrowserSteps(prevSteps => [ setBrowserSteps(prevSteps => [
...prevSteps, ...prevSteps,
@@ -67,6 +99,7 @@ export const BrowserStepsProvider: React.FC<{ children: React.ReactNode }> = ({
<BrowserStepsContext.Provider value={{ <BrowserStepsContext.Provider value={{
browserSteps, browserSteps,
addTextStep, addTextStep,
addListStep,
addScreenshotStep, addScreenshotStep,
deleteBrowserStep, deleteBrowserStep,
updateBrowserTextStepLabel, updateBrowserTextStepLabel,

View File

@@ -1,6 +1,7 @@
import React, { useCallback, useEffect, useState } from 'react'; import React, { useCallback, useEffect, useState } from 'react';
import { Grid } from '@mui/material'; import { Grid } from '@mui/material';
import { BrowserContent } from "../components/organisms/BrowserContent"; import { BrowserContent } from "../components/organisms/BrowserContent";
import { InterpretationLog } from "../components/molecules/InterpretationLog";
import { startRecording, getActiveBrowserId } from "../api/recording"; import { startRecording, getActiveBrowserId } from "../api/recording";
import { LeftSidePanel } from "../components/organisms/LeftSidePanel"; import { LeftSidePanel } from "../components/organisms/LeftSidePanel";
import { RightSidePanel } from "../components/organisms/RightSidePanel"; import { RightSidePanel } from "../components/organisms/RightSidePanel";
@@ -121,6 +122,7 @@ export const RecordingPage = ({ recordingName }: RecordingPageProps) => {
</Grid> </Grid>
<Grid id="browser-content" ref={browserContentRef} item xs> <Grid id="browser-content" ref={browserContentRef} item xs>
<BrowserContent /> <BrowserContent />
<InterpretationLog />
</Grid> </Grid>
<Grid item xs={2}> <Grid item xs={2}>
<RightSidePanel /> <RightSidePanel />

View File

@@ -23,4 +23,4 @@ export interface ScreenshotSettings {
type?: "jpeg" | "png"; type?: "jpeg" | "png";
}; };
export declare type CustomActions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag'; export declare type CustomActions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag' | 'scrapeList' | 'scrapeListAuto';