diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..d7abca37
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,79 @@
+# --- Base Stage ---
+# Shared dependency layer: both build stages start FROM this stage.
+FROM node:18 AS base
+WORKDIR /app
+
+# Copy only the package manifests first so the install layer stays cached
+# until one of them changes.
+COPY package.json package-lock.json ./
+COPY maxun-core/package.json ./maxun-core/package.json
+RUN npm install
+
+# --- Backend Build Stage ---
+FROM base AS backend-build
+WORKDIR /app
+
+# Copy TypeScript configs
+COPY tsconfig*.json ./
+COPY server/tsconfig.json ./server/
+
+# Copy ALL source code (both frontend and backend)
+COPY src ./src
+# Copy backend code and maxun-core
+COPY server/src ./server/src
+COPY maxun-core ./maxun-core
+
+# Install TypeScript globally and build
+RUN npm install -g typescript
+RUN npm run build:server
+
+# --- Frontend Build Stage ---
+FROM base AS frontend-build
+WORKDIR /app
+
+# Copy frontend code and configs
+COPY src ./src
+COPY index.html ./index.html
+COPY public ./public
+COPY vite.config.js ./
+COPY tsconfig.json ./
+
+# Build frontend
+RUN npm run build
+
+# --- Production Stage ---
+FROM nginx:alpine AS production
+
+# Install Node.js in the production image.
+# --no-cache avoids storing the apk package index in the image layer.
+RUN apk add --no-cache nodejs npm
+
+# Only compiled JS (server/dist) is shipped, and server.ts forks worker.js only
+# when NODE_ENV is 'production'. Default it here; run-time env still overrides.
+ENV NODE_ENV=production
+
+# Copy nginx configuration
+COPY nginx.conf /etc/nginx/conf.d/default.conf
+
+# Copy built frontend
+COPY --from=frontend-build /app/build /usr/share/nginx/html
+COPY --from=frontend-build /app/public/img /usr/share/nginx/html/img
+
+# Copy built backend and its dependencies
+# NOTE(review): node_modules was installed in the node:18 stage but runs on
+# Alpine's nodejs package here - confirm no native addons are affected.
+WORKDIR /app
+COPY --from=backend-build /app/package*.json ./
+COPY --from=backend-build /app/server/dist ./server/dist
+COPY --from=backend-build /app/maxun-core ./maxun-core
+COPY --from=backend-build /app/node_modules ./node_modules
+
+# Copy start script
+COPY docker-entrypoint.sh /
+RUN chmod +x /docker-entrypoint.sh
+
+# 80 = nginx, 8080 = backend server
+EXPOSE 80 8080
+
+# Start both nginx and node server
+ENTRYPOINT ["/docker-entrypoint.sh"]
diff --git a/Dockerfile.backend b/Dockerfile.backend
deleted file mode 
100644 index ec5ca679..00000000 --- a/Dockerfile.backend +++ /dev/null @@ -1,23 +0,0 @@ -# Use node image -FROM node:18-alpine - -# Set working directory in the container to /app -WORKDIR /app - -# Copy only the package.json and package-lock.json first for caching -COPY package.json package-lock.json ./ - -# Install dependencies -RUN npm install --production - -# Copy the entire project (core and backend code) -COPY . . - -# Set the working directory to the backend folder -WORKDIR /app/server - -# Expose the port the backend listens on -EXPOSE 8080 - -# Start the backend server -CMD ["npm", "run", "start:server"] diff --git a/Dockerfile.frontend b/Dockerfile.frontend deleted file mode 100644 index 9fd668ad..00000000 --- a/Dockerfile.frontend +++ /dev/null @@ -1,29 +0,0 @@ -# Use node image for the build stage -FROM node:18-alpine AS build - -# Set working directory in the container to /app -WORKDIR /app - -# Copy only the package.json and package-lock.json first for caching -COPY package.json package-lock.json ./ - -# Install dependencies (legacy peer deps is needed for react highlight, we get rid of it soon) -RUN npm install --legacy-peer-deps - -# Copy the entire project (including frontend code) -COPY . . - -# Build the frontend -RUN npm run build - -# Use NGINX for serving the built frontend in production -FROM nginx:stable-alpine - -# Copy the build output from the previous stage -COPY --from=build /app/build /usr/share/nginx/html - -# Expose the frontend port -EXPOSE 3000 - -# Start NGINX server -CMD ["nginx", "-g", "daemon off;"] diff --git a/README.md b/README.md new file mode 100644 index 00000000..327a26e7 --- /dev/null +++ b/README.md @@ -0,0 +1,118 @@ +

+
+ + +
+ Maxun +
+
+ Open-Source No-Code Web Data Extraction Platform
+

+ +

+Maxun lets you train a robot in 2 minutes and scrape the web on auto-pilot. Web data extraction doesn't get easier than this! +

+ + +

+ Website | + Discord | + Twitter | + Join Maxun Cloud +

+ +![maxun_demo](https://github.com/user-attachments/assets/a61ba670-e56a-4ae1-9681-0b4bd6ba9cdc) + + + +# Installation +### Docker +⚠️ Work In Progress. Will be available by EOD. + +### Local Setup +1. Ensure you have Node.js, PostgreSQL, MinIO and Redis installed on your system. +2. Run the commands below: +``` +git clone https://github.com/getmaxun/maxun + +# change directory to the project root +cd maxun + +# install dependencies +npm install + +# change directory to maxun-core to install dependencies +cd maxun-core +npm install + +# start frontend and backend together +npm run start +``` +You can access the frontend at http://localhost:5173/ and backend at http://localhost:8080/ + + +# Environment Variables +| Variable | Mandatory | Description | If Not Set | +|-----------------------|-----------|----------------------------------------------------------------------------------------------|--------------------------------------------------------------| +| `NODE_ENV` | Yes | Defines the app environment (`development`, `production`). | Defaults to `development`. | +| `JWT_SECRET` | Yes | Secret key used to sign and verify JSON Web Tokens (JWTs) for authentication. | JWT authentication will not work. | +| `DB_NAME` | Yes | Name of the Postgres database to connect to. | Database connection will fail. | +| `DB_USER` | Yes | Username for Postgres database authentication. | Database connection will fail. | +| `DB_PASSWORD` | Yes | Password for Postgres database authentication. | Database connection will fail. | +| `DB_HOST` | Yes | Host address where the Postgres database server is running. | Database connection will fail. | +| `DB_PORT` | Yes | Port number used to connect to the Postgres database server. | Database connection will fail. | +| `ENCRYPTION_KEY` | Yes | Key used for encrypting sensitive data (proxies, passwords). | Encryption functionality will not work. | +| `MINIO_ENDPOINT` | Yes | Endpoint URL for MinIO, to store Robot Run Screenshots. 
| Connection to MinIO storage will fail. | +| `MINIO_PORT` | Yes | Port number for MinIO service. | Connection to MinIO storage will fail. | +| `MINIO_ACCESS_KEY` | Yes | Access key for authenticating with MinIO. | MinIO authentication will fail. | +| `GOOGLE_CLIENT_ID` | No | Client ID for Google OAuth, used for Google Sheet integration authentication. | Google login will not work. | +| `GOOGLE_CLIENT_SECRET`| No | Client Secret for Google OAuth. | Google login will not work. | +| `GOOGLE_REDIRECT_URI` | No | Redirect URI for handling Google OAuth responses. | Google login will not work. | +| `REDIS_HOST` | Yes | Host address of the Redis server, used by BullMQ for scheduling robots. | Redis connection will fail. | +| `REDIS_PORT` | Yes | Port number for the Redis server. | Redis connection will fail. | +| `MAXUN_TELEMETRY` | No | Disables telemetry to stop sending anonymous usage data. Keeping it enabled helps us understand how the product is used and assess the impact of any new changes. Please keep it enabled. | Telemetry data will not be collected. | + + + +# How Does It Work? +Maxun lets you create custom robots which emulate user actions and extract data. A robot can perform any of the actions: Capture List, Capture Text or Capture Screenshot. Once a robot is created, it will keep extracting data for you without manual intervention + +![Screenshot 2024-10-23 222138](https://github.com/user-attachments/assets/53573c98-769e-490d-829e-ada9fac0764f) + +## 1. Robot Actions +1. Capture List: Useful to extract structured and bulk items from the website. Example: Scrape products from Amazon etc. +2. Capture Text: Useful to extract individual text content from the website. +3. Capture Screenshot: Get fullpage or visible section screenshots of the website. + +## 2. BYOP +BYOP (Bring Your Own Proxy) lets you connect external proxies to bypass anti-bot protection. Currently, the proxies are per user. Soon you'll be able to configure proxy per robot. 
+ + +# Features +- ✨ Extract Data With No-Code +- ✨ Handle Pagination & Scrolling +- ✨ Run Robots On A Specific Schedule +- ✨ Turn Websites to APIs +- ✨ Turn Websites to Spreadsheets +- ✨ Adapt To Website Layout Changes (coming soon) +- ✨ Extract Behind Login, With Two-Factor Authentication Support (coming soon) +- ✨ Integrations (currently Google Sheet) +- +++ A lot of amazing things soon! + +# Cloud +We offer a managed cloud version to run Maxun without having to manage the infrastructure and extract data at scale. Maxun cloud also deals with anti-bot detection, huge proxy network with automatic proxy rotation, and CAPTCHA solving. If this interests you, [join the cloud waitlist](https://docs.google.com/forms/d/e/1FAIpQLSdbD2uhqC4sbg4eLZ9qrFbyrfkXZ2XsI6dQ0USRCQNZNn5pzg/viewform) as we launch soon. + +# Note +This project is in early stages of development. Your feedback is very important for us - we're actively working to improve the product. Drop anonymous feedback here. + +# License +

+This project is licensed under AGPLv3. +

+ +# Contributors +Thank you to the combined efforts of everyone who contributes! + + + + diff --git a/docker-compose.yml b/docker-compose.yml index 1019d486..f36e8900 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,67 +1,53 @@ version: '3.8' + services: - # Frontend - frontend: + app: build: context: . - dockerfile: ./Dockerfile.frontend + dockerfile: Dockerfile + target: production + env_file: .env ports: - - "3000:3000" # Map host port 3000 to container port 3000 + - "5173:80" + - "8080:8080" depends_on: - - backend - networks: - - app-network - - # Backend - backend: - build: - context: . - dockerfile: ./Dockerfile.backend - ports: - - "8080:8080" # Map host port 8080 to container port 8080 - environment: - POSTGRES_HOST: postgres - POSTGRES_DB: mydb - POSTGRES_USER: myuser - POSTGRES_PASSWORD: mypassword - MINIO_ENDPOINT: minio - MINIO_PORT: 9000 - depends_on: - - postgres + - db - minio - networks: - - app-network + - redis - # Postgres Database - postgres: - image: postgres:15 + db: + image: postgres:13 environment: - POSTGRES_DB: mydb - POSTGRES_USER: myuser - POSTGRES_PASSWORD: mypassword + POSTGRES_DB: ${DB_NAME} + POSTGRES_USER: ${DB_USER} + POSTGRES_PASSWORD: ${DB_PASSWORD} + ports: + - "5432:5432" volumes: - postgres_data:/var/lib/postgresql/data - networks: - - app-network - # MinIO for Storage minio: image: minio/minio + environment: + MINIO_ROOT_USER: ${MINIO_ACCESS_KEY} + MINIO_ROOT_PASSWORD: ${MINIO_SECRET_KEY} command: server /data ports: - "9000:9000" - environment: - MINIO_ROOT_USER: minioadmin - MINIO_ROOT_PASSWORD: minioadmin123 volumes: - minio_data:/data - networks: - - app-network + + redis: + image: redis:6 + environment: + - REDIS_HOST=redis + - REDIS_PORT=6379 + ports: + - "6379:6379" + volumes: + - redis_data:/data volumes: postgres_data: minio_data: - -networks: - app-network: - driver: bridge + redis_data: diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh new file mode 100644 index 00000000..7e36eed4 --- 
/dev/null +++ b/docker-entrypoint.sh @@ -0,0 +1,7 @@ +#!/bin/sh + +# Start backend server +cd /app && npm run start:server & + +# Start nginx +nginx -g 'daemon off;' \ No newline at end of file diff --git a/index.html b/index.html index c7579580..8a52962b 100644 --- a/index.html +++ b/index.html @@ -8,8 +8,7 @@ name="description" content="Web site created using Vite" /> - - + Maxun | Open Source No Code Web Data Extraction Platform diff --git a/maxun-core/package.json b/maxun-core/package.json index 87e8b93d..45c69ffe 100644 --- a/maxun-core/package.json +++ b/maxun-core/package.json @@ -1,6 +1,6 @@ { "name": "maxun-core", - "version": "0.0.2", + "version": "0.0.3", "description": "Core package for Maxun, responsible for data extraction", "main": "build/index.js", "typings": "build/index.d.ts", diff --git a/nginx.conf b/nginx.conf new file mode 100644 index 00000000..e9d636f8 --- /dev/null +++ b/nginx.conf @@ -0,0 +1,17 @@ +server { + listen 80; + + location / { + root /usr/share/nginx/html; + try_files $uri $uri/ /index.html; + } + + location /api { + proxy_pass http://127.0.0.1:8080; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection 'upgrade'; + proxy_set_header Host $host; + proxy_cache_bypass $http_upgrade; + } +} \ No newline at end of file diff --git a/package.json b/package.json index 70b94311..343270b8 100644 --- a/package.json +++ b/package.json @@ -41,6 +41,7 @@ "jsonwebtoken": "^9.0.2", "loglevel": "^1.8.0", "loglevel-plugin-remote": "^0.6.8", + "maxun-core": "^0.0.3", "minio": "^8.0.1", "moment-timezone": "^0.5.45", "node-cron": "^3.0.3", @@ -75,7 +76,9 @@ "server": "./node_modules/.bin/nodemon server/src/server.ts", "client": "vite", "build": "vite build", - "test": "vite preview", + "build:server": "tsc -p server/tsconfig.json", + "start:server": "node server/dist/server/src/server.js", + "preview": "vite preview", "lint": "./node_modules/.bin/eslint ." 
}, "eslintConfig": { diff --git a/server/.gitignore b/server/.gitignore new file mode 100644 index 00000000..0d78c503 --- /dev/null +++ b/server/.gitignore @@ -0,0 +1,20 @@ +# dependencies +/node_modules + +# misc +.DS_Store +.env.local +.env.development.local +.env.test.local +.env.production.local +.env + +/.idea + +/server/logs + +/build + +/dist + +package-lock.json \ No newline at end of file diff --git a/server/src/api/record.ts b/server/src/api/record.ts index e4dca279..7710f075 100644 --- a/server/src/api/record.ts +++ b/server/src/api/record.ts @@ -298,12 +298,14 @@ router.get("/robots/:id/runs", requireAPIKey, async (req: Request, res: Response raw: true }); + const formattedRuns = runs.map(formatRunResponse); + const response = { statusCode: 200, messageCode: "success", runs: { - totalCount: runs.length, - items: runs, + totalCount: formattedRuns.length, + items: formattedRuns, }, }; @@ -319,6 +321,32 @@ router.get("/robots/:id/runs", requireAPIKey, async (req: Request, res: Response } ); +function formatRunResponse(run: any) { + const formattedRun = { + id: run.id, + status: run.status, + name: run.name, + robotId: run.robotMetaId, // Renaming robotMetaId to robotId + startedAt: run.startedAt, + finishedAt: run.finishedAt, + runId: run.runId, + runByUserId: run.runByUserId, + runByScheduleId: run.runByScheduleId, + runByAPI: run.runByAPI, + data: {}, + screenshot: null, + }; + + if (run.serializableOutput && run.serializableOutput['item-0']) { + formattedRun.data = run.serializableOutput['item-0']; + } else if (run.binaryOutput && run.binaryOutput['item-0']) { + formattedRun.screenshot = run.binaryOutput['item-0']; + } + + return formattedRun; +} + + /** * @swagger * /api/robots/{id}/runs/{runId}: @@ -393,7 +421,7 @@ router.get("/robots/:id/runs/:runId", requireAPIKey, async (req: Request, res: R const response = { statusCode: 200, messageCode: "success", - run: run, + run: formatRunResponse(run), }; res.status(200).json(response); @@ -754,7 +782,7 @@ 
router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest, const response = { statusCode: 200, messageCode: "success", - run: completedRun, + run: formatRunResponse(completedRun), }; res.status(200).json(response); diff --git a/server/src/constants/config.ts b/server/src/constants/config.ts index afc77031..74d9de4c 100644 --- a/server/src/constants/config.ts +++ b/server/src/constants/config.ts @@ -1,4 +1,4 @@ export const SERVER_PORT = process.env.SERVER_PORT ? Number(process.env.SERVER_PORT) : 8080 export const DEBUG = process.env.DEBUG === 'true' export const LOGS_PATH = process.env.LOGS_PATH ?? 'server/logs' -export const ANALYTICS_ID = process.env.ANALYTICS_ID ?? 'oss' \ No newline at end of file +export const ANALYTICS_ID = 'oss' \ No newline at end of file diff --git a/server/src/server.ts b/server/src/server.ts index dd824004..5c7fc898 100644 --- a/server/src/server.ts +++ b/server/src/server.ts @@ -62,8 +62,13 @@ readdirSync(path.join(__dirname, 'api')).forEach((r) => { } }); -const workerProcess = fork(path.resolve(__dirname, './worker.ts'), [], { - execArgv: ['--inspect=5859'], // Specify a different debug port for the worker +// Pick the worker entry point: compiled worker.js in production, worker.ts otherwise (both resolved relative to __dirname) +const isProduction = process.env.NODE_ENV === 'production'; +const workerPath = path.resolve(__dirname, isProduction ? './worker.js' : './worker.ts'); + +// Fork the worker process +const workerProcess = fork(workerPath, [], { + execArgv: isProduction ? 
['--inspect=8081'] : ['--inspect=5859'], }); workerProcess.on('message', (message) => { diff --git a/server/src/storage/db.ts b/server/src/storage/db.ts index 56c68d8b..6a23ef42 100644 --- a/server/src/storage/db.ts +++ b/server/src/storage/db.ts @@ -6,7 +6,7 @@ dotenv.config(); const sequelize = new Sequelize( `postgresql://${process.env.DB_USER}:${process.env.DB_PASSWORD}@${process.env.DB_HOST}:${process.env.DB_PORT}/${process.env.DB_NAME}`, { - host: 'localhost', + host: process.env.DB_HOST, dialect: 'postgres', logging: false, } diff --git a/server/src/storage/mino.ts b/server/src/storage/mino.ts index 96e3d0c2..3b83e386 100644 --- a/server/src/storage/mino.ts +++ b/server/src/storage/mino.ts @@ -21,6 +21,38 @@ minioClient.bucketExists('maxun-test') console.error('Error connecting to MinIO:', err); }) +async function createBucketWithPolicy(bucketName: string, policy?: 'public-read' | 'private') { + try { + const bucketExists = await minioClient.bucketExists(bucketName); + if (!bucketExists) { + await minioClient.makeBucket(bucketName); + console.log(`Bucket ${bucketName} created successfully.`); + + if (policy === 'public-read') { + // Public-read policy: allow any principal ("*") to GetObject on every object in the bucket + const policyJSON = { + Version: "2012-10-17", + Statement: [ + { + Effect: "Allow", + Principal: "*", + Action: ["s3:GetObject"], + Resource: [`arn:aws:s3:::${bucketName}/*`] + } + ] + }; + await minioClient.setBucketPolicy(bucketName, JSON.stringify(policyJSON)); + console.log(`Public-read policy applied to bucket ${bucketName}.`); + } + } else { + console.log(`Bucket ${bucketName} already exists.`); + } + } catch (error) { + console.error('Error in bucket creation or policy application:', error); + } +} + + class BinaryOutputService { private bucketName: string; @@ -98,6 +130,7 @@ class BinaryOutputService { } async uploadBinaryOutputToMinioBucket(run: Run, key: string, data: Buffer): Promise { + await createBucketWithPolicy('maxun-run-screenshots', 'public-read'); const bucketName = 
'maxun-run-screenshots'; try { console.log(`Uploading to bucket ${bucketName} with key ${key}`); diff --git a/server/src/utils/auth.ts b/server/src/utils/auth.ts index b1f6850f..e73a4237 100644 --- a/server/src/utils/auth.ts +++ b/server/src/utils/auth.ts @@ -24,9 +24,9 @@ export const comparePassword = (password: string, hash: string): Promise { - const ivLength = parseInt(getEnvVariable('IV_LENGTH'), 10); + const ivLength = 16; const iv = crypto.randomBytes(ivLength); - const algorithm = getEnvVariable('ALGORITHM'); + const algorithm = 'aes-256-cbc'; const key = Buffer.from(getEnvVariable('ENCRYPTION_KEY'), 'hex'); const cipher = crypto.createCipheriv(algorithm, key, iv); let encrypted = cipher.update(text, 'utf8', 'hex'); diff --git a/server/src/worker.ts b/server/src/worker.ts index 7bbf52af..fd3470d4 100644 --- a/server/src/worker.ts +++ b/server/src/worker.ts @@ -5,9 +5,14 @@ import { handleRunRecording } from "./workflow-management/scheduler"; import Robot from './models/Robot'; import { computeNextRun } from './utils/schedule'; +console.log('Environment variables:', { + REDIS_HOST: process.env.REDIS_HOST, + REDIS_PORT: process.env.REDIS_PORT, +}); + const connection = new IORedis({ - host: 'localhost', - port: 6379, + host: process.env.REDIS_HOST, + port: process.env.REDIS_PORT ? 
parseInt(process.env.REDIS_PORT, 10) : 6379, maxRetriesPerRequest: null, }); diff --git a/server/tsconfig.json b/server/tsconfig.json new file mode 100644 index 00000000..820e903e --- /dev/null +++ b/server/tsconfig.json @@ -0,0 +1,32 @@ +{ + "compilerOptions": { + "target": "es2018", + "module": "commonjs", + "outDir": "./dist", + "rootDir": "../", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "moduleResolution": "node", + "baseUrl": "../", + "paths": { + "*": ["*"], + "src/*": ["src/*"] + }, + "jsx": "react-jsx", + "lib": ["dom", "dom.iterable", "esnext"], + "allowJs": true + }, + "include": [ + "src/**/*", + "../src/shared/**/*", + "../src/helpers/**/*" + ], + "exclude": [ + "node_modules", + "../src/components/**/*", // Exclude frontend components + "../src/pages/**/*", // Exclude frontend pages + "../src/app/**/*" // Exclude other frontend-specific code + ] +} diff --git a/src/components/molecules/ActionDescriptionBox.tsx b/src/components/molecules/ActionDescriptionBox.tsx index 23f8c1ed..4efdb32e 100644 --- a/src/components/molecules/ActionDescriptionBox.tsx +++ b/src/components/molecules/ActionDescriptionBox.tsx @@ -110,7 +110,7 @@ const ActionDescriptionBox = () => { return ( - + {renderActionDescription()} diff --git a/src/components/molecules/NavBar.tsx b/src/components/molecules/NavBar.tsx index 89b005af..b0a409b1 100644 --- a/src/components/molecules/NavBar.tsx +++ b/src/components/molecules/NavBar.tsx @@ -54,7 +54,7 @@ export const NavBar: React.FC = ({ recordingName, isRecording }) => display: 'flex', justifyContent: 'flex-start', }}> - +
Maxun
{ diff --git a/vite.config.js b/vite.config.js index 4e690eb8..aab999e4 100644 --- a/vite.config.js +++ b/vite.config.js @@ -5,6 +5,8 @@ export default defineConfig(() => { return { build: { outDir: 'build', + manifest: true, + chunkSizeWarningLimit: 1024, }, plugins: [react()], };