Merge pull request #22 from amhsirak/develop

feat: `scrapeList`  & `scrapeListAuto` actions
This commit is contained in:
Karishma Shukla
2024-08-21 21:49:37 +05:30
committed by GitHub
16 changed files with 700 additions and 81 deletions

View File

@@ -23,7 +23,11 @@
"author": "Karishma Shukla",
"license": "MIT",
"dependencies": {
"@cliqz/adblocker-playwright": "^1.31.3",
"cross-fetch": "^4.0.0",
"joi": "^17.6.0",
"playwright": "^1.20.1"
"playwright": "^1.20.1",
"playwright-extra": "^4.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2"
}
}

View File

@@ -126,6 +126,85 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
return out;
}
/**
 * Scrolls the window down one viewport at a time until at least `limit` elements
 * matching `selector` exist in the document, or until the page height stops changing.
 * @param {string} selector - CSS selector of the items being loaded.
 * @param {number} limit - Number of matching items to wait for.
 * @returns {Promise<void>}
 */
async function scrollDownToLoadMore(selector, limit) {
  let previousHeight = 0;
  let itemsLoaded = 0;
  while (itemsLoaded < limit) {
    window.scrollBy(0, window.innerHeight);
    // Give lazily loaded content a moment to render before measuring the page.
    await new Promise(resolve => setTimeout(resolve, 1000));
    const currentHeight = document.body.scrollHeight;
    if (currentHeight === previousHeight) {
      break; // No more items to load
    }
    previousHeight = currentHeight;
    // querySelectorAll already returns every matching item in the document, so the
    // count replaces the running total; adding it would count earlier items again
    // and end the loop before `limit` items are really present.
    itemsLoaded = document.querySelectorAll(selector).length;
  }
}
/**
 * Scrolls the window up one viewport at a time until at least `limit` elements
 * matching `selector` exist in the document, or until the page height stops changing.
 * @param {string} selector - CSS selector of the items being loaded.
 * @param {number} limit - Number of matching items to wait for.
 * @returns {Promise<void>}
 */
async function scrollUpToLoadMore(selector, limit) {
  let previousHeight = 0;
  let itemsLoaded = 0;
  while (itemsLoaded < limit) {
    window.scrollBy(0, -window.innerHeight);
    // Give lazily loaded content a moment to render before measuring the page.
    await new Promise(resolve => setTimeout(resolve, 1000));
    const currentHeight = document.body.scrollHeight;
    if (currentHeight === previousHeight) {
      break; // No more items to load
    }
    previousHeight = currentHeight;
    // The query returns the total number of matching items, so assign it instead of
    // accumulating; accumulating would count the same items on every pass.
    itemsLoaded = document.querySelectorAll(selector).length;
  }
}
/**
 * Advances pagination by clicking the appropriate element matched by `selector`.
 *
 * - When the matches contain an element with the `active` class (numbered pagination),
 *   the match that follows it is clicked.
 * - Otherwise the first match is treated as a plain "Next" button and clicked.
 *
 * @param {string} selector - CSS selector of the "Next" button or of the pagination buttons.
 * @param {Array} scrapedData - Items collected so far; only its length is read.
 * @param {number} limit - Number of items after which no further pagination is needed.
 * @returns {Promise<boolean>} `false` when the limit is already met, `true` after a click.
 * @throws {Error} When nothing matches `selector` or the active page is the last match.
 */
async function clickNextPagination(selector, scrapedData, limit) {
  // Check if the limit is already met
  if (scrapedData.length >= limit) {
    return false; // No further action is needed
  }

  // Collect every match up front. Looking at a single match first would make the
  // numbered-pagination handling unreachable: if one element matches, so does the list.
  const candidates = Array.from(document.querySelectorAll(selector));
  if (candidates.length === 0) {
    throw new Error("No more items to load or pagination has ended.");
  }

  const activeIndex = candidates.findIndex(button => button.classList.contains('active'));
  if (activeIndex === -1) {
    // No page marker among the matches: treat the first match as the "Next" button.
    candidates[0].click();
    return true;
  }

  const nextButtonInPagination = candidates[activeIndex + 1];
  if (!nextButtonInPagination) {
    // The active page is the last button, so there is nowhere left to go.
    throw new Error("No more items to load or pagination has ended.");
  }
  nextButtonInPagination.click();
  return true;
}
/**
* Returns a "scrape" result from the current page.
* @returns {Array<Object>} *Curated* array of scraped information (with sparse rows removed)
@@ -183,6 +262,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
};
/**
* TODO: Simplify.
* Given an object with named lists of elements,
* groups the elements by their distance in the DOM tree.
* @param {Object.<string, {selector: string, tag: string}>} lists The named lists of HTML elements.
@@ -250,4 +330,134 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
));
}
/**
 * Scrapes a list of similar items from the current page.
 * Every element matching `listSelector` yields one record; each configured field is
 * looked up inside that element with `querySelector`.
 * @param {Object} config - Configuration object
 * @param {string} config.listSelector - Selector matching the element of each list item
 * @param {Object.<string, {selector: string, attribute?: string}>} config.fields - Fields to scrape, keyed by label
 * @param {number} [config.limit=10] - Maximum number of items to scrape
 * @returns {Promise<Array.<Object>>} Scraped records in document order, at most `limit` of them.
 *   A field whose element is not found is left out of its record.
 */
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
  const scrapedData = [];
  if (!(limit > 0)) {
    return scrapedData;
  }

  // One pass over the document is all there is to collect. Re-querying the same DOM
  // until `limit` is reached would never finish when nothing matches and would push
  // duplicate records when fewer than `limit` elements match.
  const parentElements = Array.from(document.querySelectorAll(listSelector));

  for (const parent of parentElements) {
    if (scrapedData.length >= limit) break;
    const record = {};

    // For each field, select the corresponding element within the parent
    for (const [label, { selector, attribute }] of Object.entries(fields)) {
      const fieldElement = parent.querySelector(selector);
      if (!fieldElement) continue;

      if (attribute === 'innerText') {
        record[label] = fieldElement.innerText.trim();
      } else if (attribute === 'innerHTML') {
        record[label] = fieldElement.innerHTML.trim();
      } else if (attribute === 'src') {
        // Property access (not getAttribute) so the browser-resolved value is returned.
        record[label] = fieldElement.src;
      } else if (attribute === 'href') {
        record[label] = fieldElement.href;
      } else {
        record[label] = fieldElement.getAttribute(attribute);
      }
    }
    scrapedData.push(record);
  }
  return scrapedData;
};
/**
 * Gets all children of the elements matching the listSelector,
 * returning their CSS selectors and innerText.
 * The selector of a child is built by walking up its ancestors and stops early at the
 * first element that has an id.
 * @param {string} listSelector - Selector for the list container(s)
 * @returns {Array.<{selector: string, innerText: string}>} One entry per child element
 */
window.scrapeListAuto = function (listSelector) {
  // Escape ids and class names so names with special characters still form a valid selector.
  const escapeIdent = (typeof CSS !== 'undefined' && typeof CSS.escape === 'function')
    ? (value) => CSS.escape(value)
    : (value) => value;

  // `className` is only a string on HTML elements (SVG elements expose an object),
  // so fall back to the raw `class` attribute instead of calling string methods on it.
  const classTokens = (element) => {
    const raw = typeof element.className === 'string'
      ? element.className
      : (element.getAttribute('class') || '');
    return raw.trim().split(/\s+/).filter(Boolean);
  };

  const buildSelector = (start) => {
    const selectors = [];
    let element = start;
    // Traverse up to gather the CSS selector for the element
    while (element && element !== document) {
      let selector = element.nodeName.toLowerCase();
      if (element.id) {
        // An id is taken as a sufficient anchor: stop climbing here.
        selectors.push(`${selector}#${escapeIdent(element.id)}`);
        break;
      }
      const tokens = classTokens(element);
      if (tokens.length > 0) {
        selector += `.${tokens.map(escapeIdent).join('.')}`;
      }
      selectors.push(selector);
      element = element.parentElement;
    }
    return selectors.reverse().join(' > ');
  };

  const results = [];
  Array.from(document.querySelectorAll(listSelector)).forEach(list => {
    Array.from(list.children).forEach(child => {
      // Non-HTML children (e.g. SVG) have no innerText; use their textContent instead.
      const text = typeof child.innerText === 'string' ? child.innerText : (child.textContent || '');
      results.push({
        selector: buildSelector(child),
        innerText: text.trim()
      });
    });
  });
  return results;
};
/**
 * Repeatedly jumps to the bottom of the page until at least `limit` elements matching
 * `selector` exist in the document, or until the page height stops changing.
 * @param {string} selector - CSS selector of the items being loaded.
 * @param {number} limit - Number of matching items to wait for.
 * @returns {Promise<void>}
 */
window.scrollDown = async function (selector, limit) {
  let previousHeight = 0;
  let itemsLoaded = 0;
  while (itemsLoaded < limit) {
    window.scrollTo(0, document.body.scrollHeight);
    // Give lazily loaded content a moment to render before measuring the page.
    await new Promise(resolve => setTimeout(resolve, 1000));
    const currentHeight = document.body.scrollHeight;
    if (currentHeight === previousHeight) {
      break; // No more items to load
    }
    previousHeight = currentHeight;
    // The query returns the total number of matching items, so assign it instead of
    // accumulating; accumulating would count the same items on every pass.
    itemsLoaded = document.querySelectorAll(selector).length;
  }
}
/**
 * Scrolls the window up one viewport at a time until at least `limit` elements matching
 * `selector` exist in the document, or until the page height stops changing.
 * @param {string} selector - CSS selector of the items being loaded.
 * @param {number} limit - Number of matching items to wait for.
 * @returns {Promise<void>}
 */
window.scrollUp = async function (selector, limit) {
  let previousHeight = 0;
  let itemsLoaded = 0;
  while (itemsLoaded < limit) {
    window.scrollBy(0, -window.innerHeight);
    // Give lazily loaded content a moment to render before measuring the page.
    await new Promise(resolve => setTimeout(resolve, 1000));
    const currentHeight = document.body.scrollHeight;
    if (currentHeight === previousHeight) {
      break; // No more items to load
    }
    previousHeight = currentHeight;
    // The query returns the total number of matching items, so assign it instead of
    // accumulating; accumulating would count the same items on every pass.
    itemsLoaded = document.querySelectorAll(selector).length;
  }
}
})(window);

View File

@@ -1,5 +1,7 @@
/* eslint-disable no-await-in-loop, no-restricted-syntax */
import { Page, PageScreenshotOptions } from 'playwright';
import { PlaywrightBlocker } from '@cliqz/adblocker-playwright';
import fetch from 'cross-fetch';
import path from 'path';
import { EventEmitter } from 'events';
@@ -29,6 +31,7 @@ interface InterpreterOptions {
}>
}
/**
* Class for running the Smart Workflows.
*/
@@ -45,6 +48,8 @@ export default class Interpreter extends EventEmitter {
private log: typeof log;
private blocker: PlaywrightBlocker | null = null;
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>) {
super();
this.workflow = workflow.workflow;
@@ -76,6 +81,24 @@ export default class Interpreter extends EventEmitter {
oldLog(...args);
};
}
PlaywrightBlocker.fromPrebuiltAdsAndTracking(fetch).then(blocker => {
this.blocker = blocker;
}).catch(err => {
this.log(`Failed to initialize ad-blocker:`, Level.ERROR);
})
}
/**
 * Enables ad/tracker blocking on the given page.
 * Does nothing while `this.blocker` is still `null`.
 * @param page Page to enable blocking in.
 */
private async applyAdBlocker(page: Page): Promise<void> {
  const activeBlocker = this.blocker;
  if (!activeBlocker) {
    return;
  }
  await activeBlocker.enableBlockingInPage(page);
}
/**
 * Disables ad/tracker blocking on the given page.
 * Does nothing while `this.blocker` is still `null`.
 * @param page Page to disable blocking in.
 */
private async disableAdBlocker(page: Page): Promise<void> {
  const activeBlocker = this.blocker;
  if (!activeBlocker) {
    return;
  }
  await activeBlocker.disableBlockingInPage(page);
}
/**
@@ -285,11 +308,32 @@ export default class Interpreter extends EventEmitter {
// Runs the in-page `window.scrapeSchema` with the given schema and forwards the
// result to the serializable callback.
scrapeSchema: async (schema: Record<string, { selector: string; tag: string, attribute: string; }>) => {
  await this.ensureScriptsLoaded(page);

  const scrapeResult = await page.evaluate(
    (schemaObj) => window.scrapeSchema(schemaObj),
    schema,
  );

  await this.options.serializableCallback(scrapeResult);
},
// Scrapes a list and forwards the records to the serializable callback.
// With a pagination config the interpreter drives the paging through handlePagination;
// without one a single in-page `window.scrapeList` call is made.
scrapeList: async (config: { listSelector: string, fields: any, limit?: number, pagination: any }) => {
  await this.ensureScriptsLoaded(page);

  const scrapeResults: Record<string, any>[] = config.pagination
    ? await this.handlePagination(page, config)
    : await page.evaluate((cfg) => window.scrapeList(cfg), config);

  await this.options.serializableCallback(scrapeResults);
},
// Runs the in-page `window.scrapeListAuto` for the configured list selector and
// forwards the { selector, innerText } entries to the serializable callback.
scrapeListAuto: async (config: { listSelector: string }) => {
  await this.ensureScriptsLoaded(page);

  const scrapeResults: { selector: string, innerText: string }[] = await page.evaluate(
    (listSelector) => window.scrapeListAuto(listSelector),
    config.listSelector,
  );

  await this.options.serializableCallback(scrapeResults);
},
scroll: async (pages?: number) => {
await page.evaluate(async (pagesInternal) => {
for (let i = 1; i <= (pagesInternal ?? 1); i += 1) {
@@ -298,6 +342,7 @@ export default class Interpreter extends EventEmitter {
}
}, pages ?? 1);
},
script: async (code: string) => {
const AsyncFunction: FunctionConstructor = Object.getPrototypeOf(
async () => { },
@@ -305,6 +350,7 @@ export default class Interpreter extends EventEmitter {
const x = new AsyncFunction('page', 'log', code);
await x(page, this.log);
},
// Emits a 'flag' event carrying the page and a resolver; the action stays pending
// until a listener calls that resolver.
flag: async () => {
  return new Promise((resume) => {
    this.emit('flag', page, resume);
  });
},
@@ -338,7 +384,82 @@ export default class Interpreter extends EventEmitter {
}
}
/**
 * Scrapes a list whose content is spread over several "pages", using the strategy
 * named by `config.pagination.type`:
 *
 * - `scrollDown` / `scrollUp`: scroll to the bottom / top until the page height stops
 *   changing or `config.limit` list items are present, then scrape once.
 * - `clickNext`: scrape the current page, keep only records not seen before, click
 *   `config.pagination.selector` and wait for the navigation; repeat.
 * - `clickLoadMore`: click `config.pagination.selector` until it disappears, stops
 *   adding items, or `config.limit` list items are present, then scrape once.
 * - anything else: scrape the current page once.
 *
 * Scraping is delegated to the in-page `window.scrapeList`, which receives the same config.
 *
 * @param page Page to scrape.
 * @param config List configuration including the `pagination` descriptor.
 * @returns Collected records, cut to `config.limit` where the strategy accumulates them.
 */
private async handlePagination(page: Page, config: { listSelector: string, fields: any, limit?: number, pagination: any }) {
  let allResults: Record<string, any>[] = [];
  let previousHeight = 0;
  // -1 so the first measured count never looks like "nothing new was loaded"
  let previousItemCount = -1;
  // track unique items per page to avoid re-scraping
  const scrapedItems: Set<string> = new Set<string>();

  const scrapeCurrentPage = () => page.evaluate((cfg) => window.scrapeList(cfg), config);
  const countListItems = () => page.evaluate(
    (selector) => document.querySelectorAll(selector).length,
    config.listSelector,
  );
  const hasEnoughItems = (count: number): boolean => Boolean(config.limit && count >= config.limit);

  while (true) {
    switch (config.pagination.type) {
      case 'scrollDown':
      case 'scrollUp': {
        const towardsBottom = config.pagination.type === 'scrollDown';
        await page.evaluate(
          (down) => window.scrollTo(0, down ? document.body.scrollHeight : 0),
          towardsBottom,
        );
        await page.waitForTimeout(2000);

        const currentHeight = await page.evaluate(() => document.body.scrollHeight);
        const loadedCount = await countListItems();
        // Stop once scrolling no longer grows the page, or once enough items are
        // loaded (otherwise an endless feed would be scrolled forever).
        if (currentHeight === previousHeight || hasEnoughItems(loadedCount)) {
          const finalResults = await scrapeCurrentPage();
          allResults = allResults.concat(finalResults);
          return allResults;
        }
        previousHeight = currentHeight;
        break;
      }
      case 'clickNext': {
        const pageResults = await scrapeCurrentPage();

        // Filter out already scraped items
        const newResults = pageResults.filter(item => {
          const uniqueKey = JSON.stringify(item);
          if (scrapedItems.has(uniqueKey)) return false; // Ignore if already scraped
          scrapedItems.add(uniqueKey); // Mark as scraped
          return true;
        });

        allResults = allResults.concat(newResults);

        if (config.limit && allResults.length >= config.limit) {
          return allResults.slice(0, config.limit);
        }

        // A page that contributes nothing new means the pagination is not advancing
        // any more; stop instead of clicking "next" indefinitely.
        if (newResults.length === 0) {
          return allResults;
        }

        const nextButton = await page.$(config.pagination.selector);
        if (!nextButton) {
          return allResults; // No more pages to scrape
        }
        await Promise.all([
          nextButton.click(),
          page.waitForNavigation({ waitUntil: 'networkidle' })
        ]);

        await page.waitForTimeout(1000);
        break;
      }
      case 'clickLoadMore': {
        const loadedCount = await countListItems();
        const loadMoreButton = await page.$(config.pagination.selector);
        const nothingNewLoaded = loadedCount === previousItemCount;

        // The items only exist in the DOM, so they must be scraped before returning.
        if (!loadMoreButton || nothingNewLoaded || hasEnoughItems(loadedCount)) {
          const finalResults = await scrapeCurrentPage();
          allResults = allResults.concat(finalResults);
          return allResults;
        }

        previousItemCount = loadedCount;
        await loadMoreButton.click();
        // Same settle time as the scroll strategies before counting again.
        await page.waitForTimeout(2000);
        break;
      }
      default: {
        const results = await scrapeCurrentPage();
        allResults = allResults.concat(results);
        return allResults;
      }
    }

    if (config.limit && allResults.length >= config.limit) {
      allResults = allResults.slice(0, config.limit);
      break;
    }
  }

  return allResults;
}
private async runLoop(p: Page, workflow: Workflow) {
// apply ad-blocker to the current page
await this.applyAdBlocker(p);
const usedActions: string[] = [];
let lastAction = null;
let repeatCount = 0;
@@ -404,13 +525,14 @@ export default class Interpreter extends EventEmitter {
this.log(<Error>e, Level.ERROR);
}
} else {
//await this.disableAdBlocker(p);
return;
}
}
}
private async ensureScriptsLoaded(page: Page) {
const isScriptLoaded = await page.evaluate(() => typeof window.scrape === 'function' && typeof window.scrapeSchema === 'function');
const isScriptLoaded = await page.evaluate(() => typeof window.scrape === 'function' && typeof window.scrapeSchema === 'function' && typeof window.scrapeList === 'function' && typeof window.scrapeListAuto === 'function' && typeof window.scrollDown === 'function' && typeof window.scrollUp === 'function');
if (!isScriptLoaded) {
await page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
}

View File

@@ -46,11 +46,11 @@ export default class Preprocessor {
return error;
}
/**
* Extracts parameter names from the workflow.
* @param {WorkflowFile} workflow The given workflow
* @returns {String[]} List of parameters' names.
*/
/**
* Extracts parameter names from the workflow.
* @param {WorkflowFile} workflow The given workflow
* @returns {String[]} List of parameters' names.
*/
static getParams(workflow: WorkflowFile): string[] {
const getParamsRecurse = (object: any): string[] => {
if (typeof object === 'object') {
@@ -69,10 +69,10 @@ export default class Preprocessor {
return getParamsRecurse(workflow.workflow);
}
/**
* List all the selectors used in the given workflow (only literal "selector"
* field in WHERE clauses so far)
*/
/**
* List all the selectors used in the given workflow (only literal "selector"
* field in WHERE clauses so far)
*/
// TODO : add recursive selector search (also in click/fill etc. events?)
static extractSelectors(workflow: Workflow): SelectorArray {
/**
@@ -107,11 +107,11 @@ export default class Preprocessor {
], []);
}
/**
* Recursively crawl `object` and initializes params - replaces the `{$param : paramName}` objects
* with the defined value.
* @returns {Workflow} Copy of the given workflow, modified (the initial workflow is left untouched).
*/
/**
* Recursively crawl `object` and initializes params - replaces the `{$param : paramName}` objects
* with the defined value.
* @returns {Workflow} Copy of the given workflow, modified (the initial workflow is left untouched).
*/
static initWorkflow(workflow: Workflow, params?: ParamType): Workflow {
const paramNames = this.getParams({ workflow });

View File

@@ -28,7 +28,7 @@ type MethodNames<T> = {
[K in keyof T]: T[K] extends Function ? K : never;
}[keyof T];
export type CustomFunctions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag';
export type CustomFunctions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag' | 'scrapeList' | 'scrapeListAuto';
export type What = {
action: MethodNames<Page> | CustomFunctions,

View File

@@ -7,6 +7,7 @@ import {
getElementInformation,
getRect,
getSelectors,
getNonUniqueSelectors,
isRuleOvershadowing,
selectorAlreadyInWorkflow
} from "../selector";
@@ -46,6 +47,12 @@ export class WorkflowGenerator {
*/
private socket: Socket;
/**
* getList is one of the custom actions from maxun-core.
* Used to provide appropriate selectors for the getList action.
*/
private getList: boolean = false;
/**
* The public constructor of the WorkflowGenerator.
* Takes socket for communication as a parameter and registers some important events on it.
@@ -55,6 +62,7 @@ export class WorkflowGenerator {
public constructor(socket: Socket) {
this.socket = socket;
this.registerEventHandlers(socket);
this.initializeSocketListeners();
}
/**
@@ -88,6 +96,15 @@ export class WorkflowGenerator {
lastAction: '',
}
/**
 * Subscribes to the socket events that configure the generator.
 * Currently only 'setGetList', whose payload value is stored in `this.getList`.
 */
private initializeSocketListeners() {
  const onSetGetList = (data: { getList: boolean }) => {
    this.getList = data.getList;
  };
  this.socket.on('setGetList', onSetGetList);
}
/**
* Registers the event handlers for all generator-related events on the socket.
* @param socket The socket used to communicate with the client.
@@ -459,13 +476,17 @@ export class WorkflowGenerator {
*/
private generateSelector = async (page: Page, coordinates: Coordinates, action: ActionType) => {
const elementInfo = await getElementInformation(page, coordinates);
const selectorBasedOnCustomAction = (this.getList === true)
? await getNonUniqueSelectors(page, coordinates)
: await getSelectors(page, coordinates);
const bestSelector = getBestSelectorForAction(
{
type: action,
tagName: elementInfo?.tagName as TagName || '',
inputType: undefined,
value: undefined,
selectors: await getSelectors(page, coordinates) || {},
selectors: selectorBasedOnCustomAction || {},
timestamp: 0,
isPassword: false,
hasOnlyText: elementInfo?.hasOnlyText || false,
@@ -488,6 +509,8 @@ export class WorkflowGenerator {
if (rect) {
this.socket.emit('highlighter', { rect, selector: displaySelector, elementInfo });
}
// reset getList after usage
this.getList = false;
}
/**

View File

@@ -98,20 +98,20 @@ export const getElementInformation = async (
{ x: coordinates.x, y: coordinates.y },
);
if (elementInfo) {
if (elementInfo.tagName === 'A') {
if (elementInfo.innerText) {
console.log(`Link text: ${elementInfo.innerText}, URL: ${elementInfo.url}`);
} else {
console.log(`URL: ${elementInfo.url}`);
}
} else if (elementInfo.tagName === 'IMG') {
console.log(`Image URL: ${elementInfo.imageUrl}`);
} else {
console.log(`Element innerText: ${elementInfo.innerText}`);
}
}
// if (elementInfo) {
// if (elementInfo.tagName === 'A') {
// if (elementInfo.innerText) {
// console.log(`Link text: ${elementInfo.innerText}, URL: ${elementInfo.url}`);
// } else {
// console.log(`URL: ${elementInfo.url}`);
// }
// } else if (elementInfo.tagName === 'IMG') {
// console.log(`Image URL: ${elementInfo.imageUrl}`);
// } else {
// console.log(`Element innerText: ${elementInfo.innerText}`);
// }
// }
return elementInfo;
} catch (error) {
const { message, stack } = error as Error;
@@ -721,6 +721,66 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => {
return null;
};
/**
 * Returns a non-unique css selector for the element at the given coordinates.
 * The selector is a ` > ` chain of `tag.class1.class2` segments from the element up to,
 * but not including, `document.body`. Ids are deliberately left out so the selector can
 * match sibling items as well.
 * @param page The page instance.
 * @param coordinates Coordinates of an element.
 * @category WorkflowManagement-Selectors
 * @returns {Promise<{generalSelector: string}|{}>} The selector wrapped in an object,
 * or an empty object when no element is found or the evaluation fails.
 */
export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates) => {
  try {
    const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => {
      function getNonUniqueSelector(element: HTMLElement): string {
        let selector = element.tagName.toLowerCase();

        // Avoid using IDs to maintain non-uniqueness.
        // classList is used instead of className because className is not a string on
        // SVG elements, where calling split() on it throws and loses the whole selector.
        const validClasses = Array.from(element.classList)
          // Exclude utility classes
          .filter((cls: string) => !cls.startsWith('!') && !cls.includes(':'));
        if (validClasses.length > 0) {
          // Escape special characters so the class names form a valid selector
          selector += '.' + validClasses.map(cls => CSS.escape(cls)).join('.');
        }

        return selector;
      }

      function getSelectorPath(element: HTMLElement | null): string {
        const path: string[] = [];

        while (element && element !== document.body) {
          const selector = getNonUniqueSelector(element);
          path.unshift(selector);
          element = element.parentElement;
        }

        return path.join(' > ');
      }

      const element = document.elementFromPoint(x, y) as HTMLElement | null;
      if (!element) return null;

      const generalSelector = getSelectorPath(element);
      return {
        generalSelector,
      };
    }, coordinates);

    return selectors || {};
  } catch (error) {
    console.error('Error in getNonUniqueSelectors:', error);
    return {};
  }
};
/**
* Returns the first pair from the given workflow that contains the given selector
* inside the where condition, and it is the only selector there.

View File

@@ -24,10 +24,6 @@ export const Highlighter = ({ unmodifiedRect, displayedSelector = '', width, hei
};
//console.log('unmodifiedRect:', unmodifiedRect)
//console.log('rectangle:', rect)
//console.log('canvas rectangle:', canvasRect)
return (
<div>
<HighlighterOutline
@@ -54,7 +50,7 @@ const HighlighterOutline = styled.div<HighlighterOutlineProps>`
pointer-events: none !important;
position: fixed !important;
background: #ff5d5b26 !important;
outline: 4px solid pink !important;
outline: 4px solid red !important;
//border: 4px solid #ff5d5b !important;
z-index: 2147483647 !important;
//border-radius: 5px;

View File

@@ -27,8 +27,9 @@ const Canvas = ({ width, height, onCreateRef }: CanvasProps) => {
const canvasRef = useRef<HTMLCanvasElement>(null);
const { socket } = useSocketStore();
const { setLastAction, lastAction } = useGlobalInfoStore();
const { getText, getScreenshot } = useActionContext();
const { getText, getList } = useActionContext();
const getTextRef = useRef(getText);
const getListRef = useRef(getList);
const notifyLastAction = (action: string) => {
if (lastAction !== action) {
@@ -40,7 +41,8 @@ const Canvas = ({ width, height, onCreateRef }: CanvasProps) => {
useEffect(() => {
getTextRef.current = getText;
}, [getText]);
getListRef.current = getList;
}, [getText, getList]);
const onMouseEvent = useCallback((event: MouseEvent) => {
if (socket) {
@@ -51,8 +53,9 @@ const Canvas = ({ width, height, onCreateRef }: CanvasProps) => {
switch (event.type) {
case 'mousedown':
const clickCoordinates = getMappedCoordinates(event, canvasRef.current, width, height);
if (getTextRef.current === true) {
console.log('get text')
if (getTextRef.current === true || getListRef.current === true) {
// todo: remove console.log and return
console.log('get text or get list is true');
} else {
socket.emit('input:mousedown', clickCoordinates);
}

View File

@@ -27,7 +27,7 @@ export const InterpretationLog = () => {
}
const handleLog = useCallback((msg: string, date: boolean = true) => {
if (!date){
if (!date) {
setLog((prevState) => prevState + '\n' + msg);
} else {
setLog((prevState) => prevState + '\n' + `[${new Date().toLocaleString()}] ` + msg);
@@ -42,9 +42,9 @@ export const InterpretationLog = () => {
scrollLogToBottom();
}, [log, scrollLogToBottom])
const handleBinaryCallback = useCallback(({data, mimetype}: any) => {
const handleBinaryCallback = useCallback(({ data, mimetype }: any) => {
setLog((prevState) =>
prevState + '\n' + '---------- Binary output data received ----------' + '\n'
prevState + '\n' + '---------- Binary output data received ----------' + '\n'
+ `mimetype: ${mimetype}` + '\n' + `data: ${JSON.stringify(data)}` + '\n'
+ '------------------------------------------------');
scrollLogToBottom();
@@ -66,10 +66,10 @@ export const InterpretationLog = () => {
<Accordion
expanded={expanded}
onChange={handleChange(!expanded)}
style={{background: '#3f4853', color: 'white', borderRadius: '0px'}}
style={{ background: '#3f4853', color: 'white', borderRadius: '0px' }}
>
<AccordionSummary
expandIcon={<ExpandMoreIcon sx={{color: 'white'}}/>}
expandIcon={<ExpandMoreIcon sx={{ color: 'white' }} />}
aria-controls="panel1bh-content"
id="panel1bh-header"
>
@@ -88,8 +88,8 @@ export const InterpretationLog = () => {
<Highlight className="javascript">
{log}
</Highlight>
<div style={{ float:"left", clear: "both" }}
ref={logEndRef}/>
<div style={{ float: "left", clear: "both" }}
ref={logEndRef} />
</div>
</AccordionDetails>
</Accordion>

View File

@@ -5,7 +5,7 @@ import { useBrowserDimensionsStore } from "../../context/browserDimensions";
import { Highlighter } from "../atoms/Highlighter";
import { GenericModal } from '../atoms/GenericModal';
import { useActionContext } from '../../context/browserActions';
import { useBrowserSteps } from '../../context/browserSteps';
import { useBrowserSteps, TextStep } from '../../context/browserSteps';
interface ElementInfo {
tagName: string;
@@ -45,10 +45,13 @@ export const BrowserWindow = () => {
const [attributeOptions, setAttributeOptions] = useState<AttributeOption[]>([]);
const [selectedElement, setSelectedElement] = useState<{ selector: string, info: ElementInfo | null } | null>(null);
const [listSelector, setListSelector] = useState<string | null>(null);
const [fields, setFields] = useState<Record<string, TextStep>>({});
const { socket } = useSocketStore();
const { width, height } = useBrowserDimensionsStore();
const { getText } = useActionContext();
const { addTextStep } = useBrowserSteps();
const { getText, getList } = useActionContext();
const { addTextStep, addListStep } = useBrowserSteps();
const onMouseMove = (e: MouseEvent) => {
if (canvasRef && canvasRef.current && highlighterData) {
@@ -84,8 +87,11 @@ export const BrowserWindow = () => {
}, [screenShot, canvasRef, socket, screencastHandler]);
const highlighterHandler = useCallback((data: { rect: DOMRect, selector: string, elementInfo: ElementInfo | null }) => {
if (getList === true) {
socket?.emit('setGetList', { getList: true });
}
setHighlighterData(data);
}, [highlighterData])
}, [highlighterData, getList, socket]);
useEffect(() => {
document.addEventListener('mousemove', onMouseMove, false);
@@ -128,6 +134,45 @@ export const BrowserWindow = () => {
});
}
}
if (getList === true && !listSelector) {
setListSelector(highlighterData.selector);
} else if (getList === true && listSelector) {
const options = getAttributeOptions(highlighterData.elementInfo?.tagName || '');
if (options.length > 1) {
setAttributeOptions(options);
setSelectedElement({
selector: highlighterData.selector,
info: highlighterData.elementInfo
});
setShowAttributeModal(true);
} else {
const newField: TextStep = {
id: Date.now(),
type: 'text',
label: `Label ${Object.keys(fields).length + 1}`,
data: highlighterData.elementInfo?.innerText || '',
selectorObj: {
selector: highlighterData.selector,
tag: highlighterData.elementInfo?.tagName,
attribute: 'innerText'
}
};
setFields(prevFields => {
const updatedFields = {
...prevFields,
[newField.label]: newField
};
return updatedFields;
});
if (listSelector) {
addListStep(listSelector, { ...fields, [newField.label]: newField });
}
}
}
}
}
};
@@ -153,6 +198,31 @@ export const BrowserWindow = () => {
attribute: attribute
});
}
if (getList === true) {
const newField: TextStep = {
id: Date.now(),
type: 'text',
label: `Label ${Object.keys(fields).length + 1}`,
data: selectedElement.info?.innerText || '',
selectorObj: {
selector: selectedElement.selector,
tag: selectedElement.info?.tagName,
attribute: attribute
}
};
setFields(prevFields => {
const updatedFields = {
...prevFields,
[newField.label]: newField
};
return updatedFields;
});
if (listSelector) {
addListStep(listSelector, { ...fields, [newField.label]: newField });
}
}
}
}
setShowAttributeModal(false);
@@ -161,7 +231,7 @@ export const BrowserWindow = () => {
return (
<div onClick={handleClick}>
{
getText === true ? (
getText === true || getList === true ? (
<GenericModal
isOpen={showAttributeModal}
onClose={() => { }}
@@ -179,7 +249,7 @@ export const BrowserWindow = () => {
</GenericModal>
) : null
}
{(getText === true && !showAttributeModal && highlighterData?.rect != null && highlighterData?.rect.top != null) && canvasRef?.current ?
{((getText === true || getList === true) && !showAttributeModal && highlighterData?.rect != null && highlighterData?.rect.top != null) && canvasRef?.current ?
<Highlighter
unmodifiedRect={highlighterData?.rect}
displayedSelector={highlighterData?.selector}

View File

@@ -8,11 +8,16 @@ import { SimpleBox } from "../atoms/Box";
import Typography from "@mui/material/Typography";
import { useGlobalInfoStore } from "../../context/globalInfo";
import { useActionContext } from '../../context/browserActions';
import { useBrowserSteps } from '../../context/browserSteps';
import { useBrowserSteps, ListStep, TextStep, SelectorObject } from '../../context/browserSteps';
import { useSocketStore } from '../../context/socket';
import { ScreenshotSettings } from '../../shared/types';
import InputAdornment from '@mui/material/InputAdornment';
// TODO:
// 1. Handle field label update
// 2. Handle field deletion | confirmation
// 3. Add description for each browser step
// 4. Handle non custom action steps
export const RightSidePanel = () => {
const [textLabels, setTextLabels] = useState<{ [id: number]: string }>({});
@@ -20,7 +25,7 @@ export const RightSidePanel = () => {
const [confirmedTextSteps, setConfirmedTextSteps] = useState<{ [id: number]: boolean }>({});
const { lastAction, notify } = useGlobalInfoStore();
const { getText, startGetText, stopGetText, getScreenshot, startGetScreenshot, stopGetScreenshot } = useActionContext();
const { getText, startGetText, stopGetText, getScreenshot, startGetScreenshot, stopGetScreenshot, getList, startGetList, stopGetList } = useActionContext();
const { browserSteps, updateBrowserTextStepLabel, deleteBrowserStep, addScreenshotStep } = useBrowserSteps();
const { socket } = useSocketStore();
@@ -80,6 +85,49 @@ export const RightSidePanel = () => {
}
}, [stopGetText, getTextSettingsObject, socket, browserSteps, confirmedTextSteps]);
// Builds the `scrapeList` settings from the recorded browser steps.
// Only 'list' steps with a list selector and at least one field are considered; when
// several qualify, the last one wins. Fields without a selector are left out.
// Returns an empty object when no step qualifies.
const getListSettingsObject = useCallback(() => {
  type FieldSettings = Record<string, { selector: string; tag?: string;[key: string]: any }>;
  let settings: { listSelector?: string; fields?: FieldSettings } = {};

  for (const step of browserSteps) {
    if (step.type !== 'list' || !step.listSelector || Object.keys(step.fields).length === 0) {
      continue;
    }

    const fields: FieldSettings = {};
    for (const [label, field] of Object.entries(step.fields)) {
      const selectorObj = field.selectorObj;
      if (!selectorObj?.selector) {
        continue;
      }
      fields[label] = {
        selector: selectorObj.selector,
        tag: selectorObj.tag,
        attribute: selectorObj.attribute
      };
    }

    settings = { listSelector: step.listSelector, fields };
  }

  return settings;
}, [browserSteps]);
// Leaves list-capture mode and sends the collected list settings as a `scrapeList`
// action. When no list selector or no field has been captured, nothing is emitted and
// the user is notified instead.
const stopCaptureAndEmitGetListSettings = useCallback(() => {
  stopGetList();
  const settings = getListSettingsObject();
  // getListSettingsObject always returns an object (possibly empty), so truthiness of
  // the object itself says nothing; check the content that scrapeList actually needs.
  const hasFields = settings.fields !== undefined && Object.keys(settings.fields).length > 0;
  if (settings.listSelector && hasFields) {
    socket?.emit('action', { action: 'scrapeList', settings });
  } else {
    notify('error', 'Unable to create list settings. Make sure you have defined a field for the list.');
  }
}, [stopGetList, getListSettingsObject, socket, notify]);
// const handleListFieldChange = (stepId: number, key: 'label' | 'data', value: string) => {
// updateListStepField(stepId, key, value);
// };
const captureScreenshot = (fullPage: boolean) => {
const screenshotSettings: ScreenshotSettings = {
fullPage,
@@ -101,7 +149,17 @@ export const RightSidePanel = () => {
</SimpleBox>
<Box display="flex" flexDirection="column" gap={2} style={{ margin: '15px' }}>
{!getText && !getScreenshot && <Button variant="contained" onClick={startGetText}>Capture Text</Button>}
{!getText && !getScreenshot && !getList && <Button variant="contained" onClick={startGetList}>Capture List</Button>}
{getList &&
<>
<Box display="flex" justifyContent="space-between" gap={2} style={{ margin: '15px' }}>
<Button variant="outlined" onClick={stopCaptureAndEmitGetListSettings}>Confirm</Button>
<Button variant="outlined" color="error" onClick={stopGetList}>Discard</Button>
</Box>
</>
}
{!getText && !getScreenshot && !getList && <Button variant="contained" onClick={startGetText}>Capture Text</Button>}
{getText &&
<>
<Box display="flex" justifyContent="space-between" gap={2} style={{ margin: '15px' }}>
@@ -111,7 +169,7 @@ export const RightSidePanel = () => {
</>
}
{!getText && !getScreenshot && <Button variant="contained" onClick={startGetScreenshot}>Capture Screenshot</Button>}
{!getText && !getScreenshot && !getList && <Button variant="contained" onClick={startGetScreenshot}>Capture Screenshot</Button>}
{getScreenshot && (
<Box display="flex" flexDirection="column" gap={2}>
<Button variant="contained" onClick={() => captureScreenshot(true)}>Capture Fullpage</Button>
@@ -125,7 +183,7 @@ export const RightSidePanel = () => {
{browserSteps.map(step => (
<Box key={step.id} sx={{ boxShadow: 5, padding: '10px', margin: '10px', borderRadius: '4px' }}>
{
step.type === 'text' ? (
step.type === 'text' && (
<>
<TextField
label="Label"
@@ -165,24 +223,55 @@ export const RightSidePanel = () => {
</Box>
)}
</>
) : (
step.type === 'screenshot' && (
<Box display="flex" alignItems="center">
<DocumentScannerIcon sx={{ mr: 1 }} />
<Typography>
{`Take ${step.fullPage ? 'Fullpage' : 'Visible Part'} Screenshot`}
</Typography>
)}
{step.type === 'screenshot' && (
<Box display="flex" alignItems="center">
<DocumentScannerIcon sx={{ mr: 1 }} />
<Typography>
{`Take ${step.fullPage ? 'Fullpage' : 'Visible Part'} Screenshot`}
</Typography>
</Box>
)}
{step.type === 'list' && (
<>
<Typography>List Selected Successfully</Typography>
{Object.entries(step.fields).map(([key, field]) => (
<Box key={key}>
<TextField
label="Field Label"
value={field.label || ''}
onChange={() => { }}
fullWidth
margin="normal"
InputProps={{
startAdornment: (
<InputAdornment position="start">
<EditIcon />
</InputAdornment>
)
}}
/>
<TextField
label="Field Data"
value={field.data || ''}
fullWidth
margin="normal"
InputProps={{
readOnly: true,
startAdornment: (
<InputAdornment position="start">
<TextFieldsIcon />
</InputAdornment>
)
}}
/>
</Box>
)
)
}
))}
</>
)}
</Box>
))}
</Box>
</Paper>
);
};
/** Styled `<p>` element rendered with a 15px left margin. */
export const ActionDescription = styled.p`
margin-left: 15px;
`;
};

View File

@@ -2,9 +2,12 @@ import React, { createContext, useContext, useState, ReactNode } from 'react';
/**
 * Shape of the value exposed through `ActionContext`: one boolean flag per
 * capture mode (text, list, screenshot) plus a start/stop pair for each flag.
 */
interface ActionContextProps {
// True while text capture is active.
getText: boolean;
// True while list capture is active.
getList: boolean;
// True while screenshot capture is active.
getScreenshot: boolean;
// Sets `getText` to true.
startGetText: () => void;
// Sets `getText` to false.
stopGetText: () => void;
// Sets `getList` to true.
startGetList: () => void;
// Sets `getList` to false.
stopGetList: () => void;
// Sets `getScreenshot` to true.
startGetScreenshot: () => void;
// Sets `getScreenshot` to false.
stopGetScreenshot: () => void;
}
@@ -13,16 +16,20 @@ const ActionContext = createContext<ActionContextProps | undefined>(undefined);
export const ActionProvider = ({ children }: { children: ReactNode }) => {
const [getText, setGetText] = useState<boolean>(false);
const [getList, setGetList] = useState<boolean>(false);
const [getScreenshot, setGetScreenshot] = useState<boolean>(false);
const startGetText = () => setGetText(true);
const stopGetText = () => setGetText(false);
const startGetList = () => setGetList(true);
const stopGetList = () => setGetList(false);
const startGetScreenshot = () => setGetScreenshot(true);
const stopGetScreenshot = () => setGetScreenshot(false);
return (
<ActionContext.Provider value={{ getText, getScreenshot, startGetText, stopGetText, startGetScreenshot, stopGetScreenshot }}>
<ActionContext.Provider value={{ getText, getList, getScreenshot, startGetText, stopGetText, startGetList, stopGetList, startGetScreenshot, stopGetScreenshot }}>
{children}
</ActionContext.Provider>
);
@@ -34,4 +41,4 @@ export const useActionContext = () => {
throw new Error('useActionContext must be used within an ActionProvider');
}
return context;
};
};

View File

@@ -1,6 +1,6 @@
import React, { createContext, useContext, useState } from 'react';
interface TextStep {
export interface TextStep {
id: number;
type: 'text';
label: string;
@@ -14,10 +14,16 @@ interface ScreenshotStep {
fullPage: boolean;
}
/**
 * A browser step describing a captured list: the selector of the list itself
 * and the text fields collected for it.
 */
export interface ListStep {
// Numeric identifier of the step (`addListStep` assigns `Date.now()` on creation).
id: number;
// Discriminant that distinguishes this variant inside the `BrowserStep` union.
type: 'list';
// Selector of the list; `addListStep` uses it to find an already existing list step.
listSelector: string;
// Captured fields, each stored as a `TextStep`.
// NOTE(review): the meaning of the string keys is not visible here — confirm with the caller.
fields: { [key: string]: TextStep };
}
type BrowserStep = TextStep | ScreenshotStep;
type BrowserStep = TextStep | ScreenshotStep | ListStep;
interface SelectorObject {
export interface SelectorObject {
selector: string;
tag?: string;
attribute?: string;
@@ -27,6 +33,7 @@ interface SelectorObject {
interface BrowserStepsContextType {
browserSteps: BrowserStep[];
addTextStep: (label: string, data: string, selectorObj: SelectorObject) => void;
addListStep: (listSelector: string, fields: { [key: string]: TextStep }) => void
addScreenshotStep: (fullPage: boolean) => void;
deleteBrowserStep: (id: number) => void;
updateBrowserTextStepLabel: (id: number, newLabel: string) => void;
@@ -44,6 +51,31 @@ export const BrowserStepsProvider: React.FC<{ children: React.ReactNode }> = ({
]);
};
/**
 * Records fields for the list identified by `listSelector`.
 *
 * If a list step with the same selector already exists, `newFields` are merged
 * into its `fields` (new entries win on key clashes) and the step keeps its
 * position and id. Otherwise a new list step is appended, with `Date.now()`
 * as its id.
 *
 * @param listSelector Selector identifying the list the fields belong to.
 * @param newFields    Fields to add to (or create) the list step.
 */
const addListStep = (listSelector: string, newFields: { [key: string]: TextStep }) => {
  setBrowserSteps(prevSteps => {
    const targetIndex = prevSteps.findIndex(
      candidate => candidate.type === 'list' && candidate.listSelector === listSelector
    );

    // No list step for this selector yet: append a fresh one.
    if (targetIndex === -1) {
      const createdStep: ListStep = {
        id: Date.now(),
        type: 'list',
        listSelector,
        fields: newFields
      };
      return [...prevSteps, createdStep];
    }

    // Rebuild the array, swapping only the first matching step for a merged copy
    // so the previous state is never mutated.
    return prevSteps.map((step, index) => {
      if (index !== targetIndex) {
        return step;
      }
      const currentListStep = step as ListStep;
      return {
        ...currentListStep,
        fields: { ...currentListStep.fields, ...newFields }
      };
    });
  });
};
const addScreenshotStep = (fullPage: boolean) => {
setBrowserSteps(prevSteps => [
...prevSteps,
@@ -67,6 +99,7 @@ export const BrowserStepsProvider: React.FC<{ children: React.ReactNode }> = ({
<BrowserStepsContext.Provider value={{
browserSteps,
addTextStep,
addListStep,
addScreenshotStep,
deleteBrowserStep,
updateBrowserTextStepLabel,

View File

@@ -1,6 +1,7 @@
import React, { useCallback, useEffect, useState } from 'react';
import { Grid } from '@mui/material';
import { BrowserContent } from "../components/organisms/BrowserContent";
import { InterpretationLog } from "../components/molecules/InterpretationLog";
import { startRecording, getActiveBrowserId } from "../api/recording";
import { LeftSidePanel } from "../components/organisms/LeftSidePanel";
import { RightSidePanel } from "../components/organisms/RightSidePanel";
@@ -121,6 +122,7 @@ export const RecordingPage = ({ recordingName }: RecordingPageProps) => {
</Grid>
<Grid id="browser-content" ref={browserContentRef} item xs>
<BrowserContent />
<InterpretationLog />
</Grid>
<Grid item xs={2}>
<RightSidePanel />

View File

@@ -23,4 +23,4 @@ export interface ScreenshotSettings {
type?: "jpeg" | "png";
};
export declare type CustomActions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag';
/**
 * Union of the string names accepted as custom actions, including the
 * list-scraping actions `scrapeList` and `scrapeListAuto`.
 */
export declare type CustomActions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag' | 'scrapeList' | 'scrapeListAuto';