feat: add server side crawl and search logic
@@ -16,7 +16,6 @@ function processWorkflow(workflow: WorkflowFile, checkLimit: boolean = false): W
  processedWorkflow.workflow.forEach((pair) => {
    pair.what.forEach((action) => {
      // Handle limit validation for scrapeList action
      if (action.action === 'scrapeList' && checkLimit && Array.isArray(action.args) && action.args.length > 0) {
        const scrapeConfig = action.args[0];
        if (scrapeConfig && typeof scrapeConfig === 'object' && 'limit' in scrapeConfig) {
@@ -26,7 +25,6 @@ function processWorkflow(workflow: WorkflowFile, checkLimit: boolean = false): W
        }
      }

      // Handle decryption for type and press actions
      if ((action.action === 'type' || action.action === 'press') && Array.isArray(action.args) && action.args.length > 1) {
        try {
          const encryptedValue = action.args[1];
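Note: the body of the limit check falls between these two hunks and is not shown. A minimal sketch of what a clamp of this kind typically looks like — `MAX_LIST_LIMIT`, the helper name, and the clamp-rather-than-reject behavior are assumptions, not code from this commit:

// Hypothetical sketch; the commit's actual validation body is outside the visible hunks.
const MAX_LIST_LIMIT = 1000; // assumed cap

function clampScrapeListLimit(scrapeConfig: { limit?: unknown }): void {
  if (typeof scrapeConfig.limit === 'number' && scrapeConfig.limit > MAX_LIST_LIMIT) {
    // Clamp rather than reject, so the workflow still runs with a bounded list size
    scrapeConfig.limit = MAX_LIST_LIMIT;
  }
}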
@@ -93,10 +91,14 @@ export class WorkflowInterpreter {
  public serializableDataByType: {
    scrapeSchema: Record<string, any>;
    scrapeList: Record<string, any>;
    crawl: Record<string, any>;
    search: Record<string, any>;
    [key: string]: any;
  } = {
    scrapeSchema: {},
    scrapeList: {},
    crawl: {},
    search: {},
  };

  private currentActionName: string | null = null;
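For orientation, after a run that executed one list scrape and one search, this map might look as follows (action names and payloads are illustrative only, not from the commit):

// Illustrative contents; the inner keys come from the workflow's action names.
const exampleSerializableDataByType = {
  scrapeSchema: {},
  scrapeList: { 'Product List': [{ title: 'Item A' }, { title: 'Item B' }] },
  crawl: {},
  search: { 'Search Results': { query: 'demo', results: [{ url: 'https://example.com' }] } },
};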
@@ -282,7 +284,6 @@ export class WorkflowInterpreter {
          }
        } else if (this.currentActionType === 'scrapeList') {
          if (data && Array.isArray(data) && data.length > 0) {
            // Use the current index for persistence
            await this.persistDataToDatabase('scrapeList', data, this.currentScrapeListIndex);
          }
@@ -293,7 +294,6 @@ export class WorkflowInterpreter {
          }
        },
        binaryCallback: async (data: string, mimetype: string) => {
          // For editor mode, we don't have the name yet, so use a timestamp-based name
          const binaryItem = {
            name: `Screenshot ${Date.now()}`,
            mimeType: mimetype,
@@ -301,7 +301,6 @@ export class WorkflowInterpreter {
          };
          this.binaryData.push(binaryItem);

          // Persist binary data to database
          await this.persistBinaryDataToDatabase(binaryItem);

          this.socket.emit('binaryCallback', {
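The field between `mimeType` and the closing brace falls outside the hunk, presumably the payload itself. A sketch of the item shape the visible lines suggest — the `data` field name is an assumption:

// Assumed shape of the binary item this callback assembles.
interface BinaryItem {
  name: string;     // e.g. `Screenshot 1712345678901` (timestamp-based)
  mimeType: string; // e.g. 'image/png'
  data: string;     // assumed: the payload string passed to binaryCallback
}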
@@ -340,7 +339,6 @@ export class WorkflowInterpreter {
    logger.log('debug', `Interpretation finished`);

    // Flush any remaining data in persistence buffer before completing
    await this.flushPersistenceBuffer();

    this.interpreter = null;
@@ -419,6 +417,8 @@ export class WorkflowInterpreter {
    this.serializableDataByType = {
      scrapeSchema: {},
      scrapeList: {},
      crawl: {},
      search: {},
    };
    this.binaryData = [];
    this.currentScrapeListIndex = 0;
@@ -591,12 +591,20 @@ export class WorkflowInterpreter {
        typeKey = "scrapeList";
      } else if (this.currentActionType === "scrapeSchema") {
        typeKey = "scrapeSchema";
      } else if (this.currentActionType === "crawl") {
        typeKey = "crawl";
      } else if (this.currentActionType === "search") {
        typeKey = "search";
      }

      if (typeKey === "scrapeList" && data.scrapeList) {
        data = data.scrapeList;
      } else if (typeKey === "scrapeSchema" && data.scrapeSchema) {
        data = data.scrapeSchema;
      } else if (typeKey === "crawl" && data.crawl) {
        data = data.crawl;
      } else if (typeKey === "search" && data.search) {
        data = data.search;
      }

      let actionName = "";
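The two chains above first map the current action type to a bucket key, then unwrap payloads that arrive wrapped under that same key. A worked example (sample data only):

// If the interpreter hands back { crawl: { 'My Crawl': [...] } } while
// currentActionType === 'crawl', the unwrap step reduces it to the inner map:
let data: any = { crawl: { 'My Crawl': [{ url: 'https://example.com' }] } };
const typeKey = 'crawl';
if (typeKey === 'crawl' && data.crawl) {
  data = data.crawl; // now { 'My Crawl': [...] }; the single key later becomes actionName
}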
@@ -609,38 +617,65 @@ export class WorkflowInterpreter {
          actionName = keys[keys.length - 1];
          data = data[actionName];
        }
      } else if (typeKey === "crawl" && data && typeof data === "object" && !Array.isArray(data)) {
        const keys = Object.keys(data);
        if (keys.length === 1) {
          actionName = keys[0];
          data = data[actionName];
        } else if (keys.length > 1) {
          actionName = keys[keys.length - 1];
          data = data[actionName];
        }
      } else if (typeKey === "search" && data && typeof data === "object" && !Array.isArray(data)) {
        const keys = Object.keys(data);
        if (keys.length === 1) {
          actionName = keys[0];
          data = data[actionName];
        } else if (keys.length > 1) {
          actionName = keys[keys.length - 1];
          data = data[actionName];
        }
      }

      if (!actionName) {
        actionName = this.currentActionName || "";
        if (typeKey === "scrapeList" && !actionName) {
          actionName = this.getUniqueActionName(typeKey, "");
        } else if (typeKey === "crawl" && !actionName) {
          actionName = this.getUniqueActionName(typeKey, "Crawl Results");
        } else if (typeKey === "search" && !actionName) {
          actionName = this.getUniqueActionName(typeKey, "Search Results");
        }
      }

-      const flattened = Array.isArray(data)
-        ? data
-        : (
-            data?.List ??
-            (data && typeof data === "object"
-              ? Object.values(data).flat?.() ?? data
-              : [])
-          );
+      let processedData;
+      if (typeKey === "search") {
+        processedData = data;
+      } else {
+        processedData = Array.isArray(data)
+          ? data
+          : (
+              data?.List ??
+              (data && typeof data === "object"
+                ? Object.values(data).flat?.() ?? data
+                : [])
+            );
+      }

      if (!this.serializableDataByType[typeKey]) {
        this.serializableDataByType[typeKey] = {};
      }

-      this.serializableDataByType[typeKey][actionName] = flattened;
+      this.serializableDataByType[typeKey][actionName] = processedData;

      await this.persistDataToDatabase(typeKey, {
-        [actionName]: flattened,
+        [actionName]: processedData,
      });

      this.socket.emit("serializableCallback", {
        type: typeKey,
        name: actionName,
-        data: flattened,
+        data: processedData,
      });
    } catch (err: any) {
      logger.log('error', `serializableCallback handler failed: ${err.message}`);
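The new `processedData` branch is the behavioural core of this hunk: list-style payloads are still flattened, while search payloads keep their structure. A standalone illustration of why (sample data only):

// Flattening keyed lists merges their rows into one array; a structured
// search result would be destroyed by the same treatment.
const listPayload = { 'Page 1': [{ t: 'a' }], 'Page 2': [{ t: 'b' }] };
const flattened = Object.values(listPayload).flat(); // [{ t: 'a' }, { t: 'b' }]

const searchPayload = { query: 'demo', results: [{ url: 'https://example.com' }] };
// Object.values(searchPayload).flat() would yield ['demo', { url: ... }] — garbage,
// which is why typeKey === 'search' bypasses the flattening entirely.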
@@ -698,7 +733,6 @@ export class WorkflowInterpreter {
    await this.flushPersistenceBuffer();

    // Structure the output to maintain separate data for each action type
    const result = {
      log: this.debugMessages,
      result: status,
@@ -794,7 +828,7 @@ export class WorkflowInterpreter {
    const currentSerializableOutput = run.serializableOutput ?
      JSON.parse(JSON.stringify(run.serializableOutput)) :
-      { scrapeSchema: [], scrapeList: [] };
+      { scrapeSchema: {}, scrapeList: {}, crawl: {}, search: {} };

    if (Array.isArray(currentSerializableOutput.scrapeList)) {
      currentSerializableOutput.scrapeList = {};
@@ -802,6 +836,9 @@ export class WorkflowInterpreter {
    if (Array.isArray(currentSerializableOutput.scrapeSchema)) {
      currentSerializableOutput.scrapeSchema = {};
    }
    if (!currentSerializableOutput.search) {
      currentSerializableOutput.search = {};
    }

    let hasUpdates = false;
@@ -827,6 +864,18 @@ export class WorkflowInterpreter {
      }
      mergeLists(currentSerializableOutput.scrapeList, item.data);
      hasUpdates = true;
    } else if (item.actionType === 'crawl') {
      currentSerializableOutput.crawl = {
        ...(currentSerializableOutput.crawl || {}),
        ...item.data
      };
      hasUpdates = true;
    } else if (item.actionType === 'search') {
      currentSerializableOutput.search = {
        ...(currentSerializableOutput.search || {}),
        ...item.data
      };
      hasUpdates = true;
    }
  }
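Both new branches use a shallow spread merge, so a buffered item whose action name already exists replaces that key wholesale rather than deep-merging into it. A compact demonstration:

// Shallow spread: later buffer items win per action-name key.
const existing = { 'Crawl Results': [{ url: 'https://example.com/a' }] };
const incoming = { 'Crawl Results': [{ url: 'https://example.com/b' }] };
const merged = { ...existing, ...incoming };
// merged['Crawl Results'] is only [{ url: '.../b' }] — replaced, not appended.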
@@ -13,7 +13,11 @@ interface AirtableUpdateTask {
interface SerializableOutput {
  scrapeSchema?: Record<string, any[]>;
  scrapeList?: Record<string, any[]>;
  markdown?: Array<{ content: string }>;
  html?: Array<{ content: string }>;
  crawl?: Record<string, any[]>;
  search?: any;
}

const MAX_RETRIES = 3;
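A value matching the widened interface, for orientation — all payloads below are illustrative, not from the commit:

const sample: SerializableOutput = {
  scrapeList: { 'Product List': [{ title: 'Item A' }] },
  markdown: [{ content: '# Page' }],
  html: [{ content: '<h1>Page</h1>' }],
  crawl: { 'Site Crawl': [{ url: 'https://example.com', status: 200 }] },
  search: { query: 'demo', results: [{ url: 'https://example.com' }] },
};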
@@ -67,6 +71,10 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
  const schemaData: Array<{ Group: string; Field: string; Value: any }> = [];
  const listData: any[] = [];
  const screenshotData: Array<{ key: string; url: string }> = [];
  const markdownData: any[] = [];
  const htmlData: any[] = [];
  const crawlData: any[] = [];
  const searchData: any[] = [];

  if (serializableOutput.scrapeSchema) {
    if (Array.isArray(serializableOutput.scrapeSchema)) {
@@ -122,6 +130,66 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
    }
  }

  if (serializableOutput.markdown && Array.isArray(serializableOutput.markdown)) {
    serializableOutput.markdown.forEach((item, index) => {
      if (item.content) {
        markdownData.push({
          "Index": index + 1,
          "Type": "Markdown",
          "Content": item.content
        });
      }
    });
  }

  if (serializableOutput.html && Array.isArray(serializableOutput.html)) {
    serializableOutput.html.forEach((item, index) => {
      if (item.content) {
        htmlData.push({
          "Index": index + 1,
          "Type": "HTML",
          "Content": item.content
        });
      }
    });
  }

  if (serializableOutput.crawl && typeof serializableOutput.crawl === "object") {
    for (const [crawlName, crawlArray] of Object.entries(serializableOutput.crawl)) {
      if (Array.isArray(crawlArray)) {
        crawlArray.forEach((crawlItem) => {
          const hasContent = Object.values(crawlItem || {}).some(
            (value) => value !== null && value !== undefined && value !== ""
          );
          if (hasContent) {
            crawlData.push({ "Crawl Type": crawlName, ...crawlItem });
          }
        });
      }
    }
  }

  if (serializableOutput.search) {
    let results: any[] = [];

    if (serializableOutput.search.results && Array.isArray(serializableOutput.search.results)) {
      results = serializableOutput.search.results;
    } else if (Array.isArray(serializableOutput.search)) {
      results = serializableOutput.search;
    } else {
      results = [serializableOutput.search];
    }

    results.forEach((result) => {
      const hasContent = Object.values(result || {}).some(
        (value) => value !== null && value !== undefined && value !== ""
      );
      if (hasContent) {
        searchData.push(result);
      }
    });
  }

  // Collect screenshot data (handles both string and object forms safely)
  // if (binaryOutput && Object.keys(binaryOutput).length > 0) {
  //   Object.entries(binaryOutput).forEach(([key, rawValue]: [string, any]) => {
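Each crawl row is tagged with its originating crawl name before the per-field merge, so rows from different crawls stay distinguishable in a flat table. For example:

// One crawl entry becomes one Airtable row, prefixed with its crawl name.
const crawlName = 'Site Crawl';
const crawlItem = { url: 'https://example.com', title: 'Home' };
const row = { 'Crawl Type': crawlName, ...crawlItem };
// → { 'Crawl Type': 'Site Crawl', url: 'https://example.com', title: 'Home' }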
@@ -152,7 +220,15 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
  // }

  // --- Merge all types into Airtable rows ---
-  const maxLength = Math.max(schemaData.length, listData.length, screenshotData.length);
+  const maxLength = Math.max(
+    schemaData.length,
+    listData.length,
+    screenshotData.length,
+    markdownData.length,
+    htmlData.length,
+    crawlData.length,
+    searchData.length
+  );

  for (let i = 0; i < maxLength; i++) {
    const record: Record<string, any> = {};
@@ -176,6 +252,38 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
      record.Screenshot = screenshotData[i].url;
    }

    if (i < markdownData.length) {
      Object.entries(markdownData[i] || {}).forEach(([key, value]) => {
        if (value !== null && value !== undefined && value !== "") {
          record[key] = value;
        }
      });
    }

    if (i < htmlData.length) {
      Object.entries(htmlData[i] || {}).forEach(([key, value]) => {
        if (value !== null && value !== undefined && value !== "") {
          record[key] = value;
        }
      });
    }

    if (i < crawlData.length) {
      Object.entries(crawlData[i] || {}).forEach(([key, value]) => {
        if (value !== null && value !== undefined && value !== "") {
          record[key] = value;
        }
      });
    }

    if (i < searchData.length) {
      Object.entries(searchData[i] || {}).forEach(([key, value]) => {
        if (value !== null && value !== undefined && value !== "") {
          record[key] = value;
        }
      });
    }

    if (Object.keys(record).length > 0) {
      allRecords.push(record);
    }
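Rows are zipped by index across all seven buckets, so row i of a record can mix fields from unrelated outputs, and colliding keys are last-writer-wins. A small trace:

// Index-aligned merge: row 0 combines markdownData[0] and htmlData[0];
// both carry 'Content', so the HTML value overwrites the Markdown one.
const markdownRow = { Index: 1, Type: 'Markdown', Content: '# Page' };
const htmlRow = { Index: 1, Type: 'HTML', Content: '<h1>Page</h1>' };
const record: Record<string, any> = {};
Object.entries(markdownRow).forEach(([k, v]) => (record[k] = v));
Object.entries(htmlRow).forEach(([k, v]) => (record[k] = v));
// record.Content === '<h1>Page</h1>'

Note also that since `maxLength` now covers all seven buckets, the trailing `for (let i = maxLength; ...)` overflow loops in the next hunk appear to be defensive no-ops: no bucket can be longer than `maxLength`.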
@@ -194,6 +302,18 @@ function mergeRelatedData(serializableOutput: SerializableOutput, binaryOutput:
      Screenshot: screenshotData[i].url,
    });
  }
  for (let i = maxLength; i < markdownData.length; i++) {
    allRecords.push(markdownData[i]);
  }
  for (let i = maxLength; i < htmlData.length; i++) {
    allRecords.push(htmlData[i]);
  }
  for (let i = maxLength; i < crawlData.length; i++) {
    allRecords.push(crawlData[i]);
  }
  for (let i = maxLength; i < searchData.length; i++) {
    allRecords.push(searchData[i]);
  }

  return allRecords;
}
@@ -13,6 +13,10 @@ interface GoogleSheetUpdateTask {
interface SerializableOutput {
  scrapeSchema?: Record<string, any[]>;
  scrapeList?: Record<string, any[]>;
  markdown?: Array<{ content: string }>;
  html?: Array<{ content: string }>;
  crawl?: Record<string, any[]>;
  search?: any;
}
@@ -95,6 +99,72 @@ export async function updateGoogleSheet(robotId: string, runId: string) {
      }
    }

    if (serializableOutput.markdown && Array.isArray(serializableOutput.markdown) && serializableOutput.markdown.length > 0) {
      const markdownData = serializableOutput.markdown.map((item, index) => ({
        "Index": index + 1,
        "Content": item.content || ""
      }));

      await processOutputType(
        robotId,
        spreadsheetId,
        'Markdown',
        markdownData,
        plainRobot
      );
    }

    if (serializableOutput.html && Array.isArray(serializableOutput.html) && serializableOutput.html.length > 0) {
      const htmlData = serializableOutput.html.map((item, index) => ({
        "Index": index + 1,
        "Content": item.content || ""
      }));

      await processOutputType(
        robotId,
        spreadsheetId,
        'HTML',
        htmlData,
        plainRobot
      );
    }

    if (serializableOutput.crawl && typeof serializableOutput.crawl === "object") {
      for (const [crawlName, crawlArray] of Object.entries(serializableOutput.crawl)) {
        if (!Array.isArray(crawlArray) || crawlArray.length === 0) continue;

        await processOutputType(
          robotId,
          spreadsheetId,
          `Crawl - ${crawlName}`,
          crawlArray,
          plainRobot
        );
      }
    }

    if (serializableOutput.search) {
      let searchData: any[] = [];

      if (serializableOutput.search.results && Array.isArray(serializableOutput.search.results)) {
        searchData = serializableOutput.search.results;
      } else if (Array.isArray(serializableOutput.search)) {
        searchData = serializableOutput.search;
      } else {
        searchData = [serializableOutput.search];
      }

      if (searchData.length > 0) {
        await processOutputType(
          robotId,
          spreadsheetId,
          'Search Results',
          searchData,
          plainRobot
        );
      }
    }
  }

  if (plainRun.binaryOutput && Object.keys(plainRun.binaryOutput).length > 0) {
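Because `search` is typed `any`, both the Airtable and Google Sheets paths normalize it the same way before writing rows. The three accepted shapes, extracted as a standalone helper for clarity (the function name is ours, the logic mirrors the branches above):

function normalizeSearch(search: any): any[] {
  if (search?.results && Array.isArray(search.results)) return search.results; // 1. { results: [...] }
  if (Array.isArray(search)) return search;                                    // 2. bare array of results
  return [search];                                                             // 3. single result object
}

// normalizeSearch({ query: 'demo', results: [{ url: 'https://example.com' }] })
//   → [{ url: 'https://example.com' }]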
@@ -484,6 +484,8 @@ async function executeRun(id: string, userId: string) {
    const categorizedOutput = {
      scrapeSchema: finalRun?.serializableOutput?.scrapeSchema || {},
      scrapeList: finalRun?.serializableOutput?.scrapeList || {},
      crawl: finalRun?.serializableOutput?.crawl || {},
      search: finalRun?.serializableOutput?.search || {}
    };

    await destroyRemoteBrowser(plainRun.browserId, userId);
@@ -570,6 +572,8 @@ async function executeRun(id: string, userId: string) {
      }, {} as Record<string, any[]>)
      : {},
    captured_lists: categorizedOutput.scrapeList,
    crawl_data: categorizedOutput.crawl,
    search_data: categorizedOutput.search,
    captured_texts_count: totalSchemaItemsExtracted,
    captured_lists_count: totalListItemsExtracted,
    screenshots_count: extractedScreenshotsCount
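The hunk above extends what appears to be the run-completion payload with the two new buckets. A sketch of the resulting shape — only the fields visible in the hunk are confirmed; the interface name and the `captured_texts` field are assumptions:

// Hypothetical shape of the run-completion payload after this commit.
interface RunSummaryPayload {
  captured_texts: Record<string, any[]>;  // assumed: built by the reduce above the hunk
  captured_lists: Record<string, any[]>;  // categorizedOutput.scrapeList
  crawl_data: Record<string, any[]>;      // categorizedOutput.crawl (new)
  search_data: any;                       // categorizedOutput.search (new)
  captured_texts_count: number;
  captured_lists_count: number;
  screenshots_count: number;
}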