Makhdoom's Portfolio

Timeline

Project started

2024-07-06

Google Sheets Monitor Bot completed

2024-07-18

Category Crawler Bot completed

2024-07-23

URL Checker Bot completed

2024-07-30

Detailed Description

Automated Web Scraping and Data Collection

The "Node 🤖🤖🤖 Bots" project leverages Node.js along with libraries like Puppeteer to automate web scraping, providing an efficient way to collect data from various sources. This system can be tailored to analyze websites, gather content, and perform actions without user intervention. Whether you need to monitor changes in websites, collect critical business data, or extract information for further processing, this bot-driven solution ensures accuracy and speed.

Advanced Monitoring with Dashboard Integration

Incorporating real-time monitoring and performance tracking, the Node Bots project integrates an intuitive dashboard where users can monitor bot activities. With features like status updates, error logs, and efficiency reports, users have full control and visibility over their web scraping and automation tasks. The dashboard not only enhances productivity but also provides insights into how effectively the bots are performing in real time.

Customizable Automation for Targeted Tasks

The Node Bots system is highly customizable, allowing developers to tailor the bots to specific automation tasks. From checking the health of websites to conducting in-depth SEO analysis, the flexibility of Node.js ensures that these bots can adapt to a variety of use cases. Whether your focus is on efficiency, error reduction, or detailed data extraction, this project makes it possible to streamline workflows and automate tedious manual tasks.


/**
 * Launches a Puppeteer browser behind the given proxy and prepares a page
 * that presents the user agent / platform generated for `botId`.
 *
 * The returned page has request interception enabled: stylesheet, image,
 * font and media requests are aborted, and every other request is continued
 * with the `user-agent` and `sec-ch-ua-platform` headers overridden.
 *
 * @param {{server: string, port: (string|number), username: string, password: string}} proxy
 *     Proxy endpoint and the credentials passed to `page.authenticate`.
 * @param {*} botId - Identifier forwarded to `generateUniqueUserAgent`.
 * @returns {Promise<{browser: object, page: object}>} The launched browser
 *     and the configured page. The caller is responsible for closing the browser.
 * @throws Re-throws any error raised while configuring the page, after
 *     closing the browser that was just launched.
 */
async function setupBrowser(proxy, botId) {
    const browser = await puppeteer.launch({
        headless: process.env.HEADLESS === 'true',
        args: [
            `--proxy-server=${proxy.server}:${proxy.port}`,
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-gpu',
            '--disable-dev-shm-usage',
            '--ignore-certificate-errors',
        ],
    });

    try {
        const page = await browser.newPage();

        await page.authenticate({
            username: proxy.username,
            password: proxy.password,
        });

        const { userAgent, platform, osInfo } = generateUniqueUserAgent(botId);
        await page.setUserAgent(userAgent);

        // Runs inside every new document before the page's own scripts, so the
        // overridden navigator/window values are what page scripts observe.
        await page.evaluateOnNewDocument((userAgent, platform, osInfo) => {
            Object.defineProperty(navigator, 'platform', { value: platform });
            Object.defineProperty(navigator, 'userAgent', { value: userAgent });
            Object.defineProperty(navigator, 'appVersion', { value: osInfo });
            window.outerWidth = 1920;
            window.outerHeight = 1080;
            window.chrome = { runtime: {} };
            const originalRTCPeerConnection = window.RTCPeerConnection;
            // Wrap RTCPeerConnection so data channels created by the page are inert stubs.
            window.RTCPeerConnection = function (...args) {
                const pc = new originalRTCPeerConnection(...args);
                pc.createDataChannel = function () { return {}; };
                return pc;
            };
        }, userAgent, platform, osInfo);

        await page.setRequestInterception(true);

        page.on('request', (request) => {
            // Block specific resource types
            if (['stylesheet', 'image', 'font', 'media'].includes(request.resourceType())) {
                request.abort();
                return;
            }

            // Modify headers for all requests that are allowed through
            const headers = {
                ...request.headers(),
                'sec-ch-ua-platform': `"${platform}"`,
                'user-agent': userAgent,
            };
            request.continue({ headers });
        });

        return { browser, page };
    } catch (error) {
        // The caller never receives the browser handle on failure, so close it
        // here; otherwise the launched browser process would be leaked.
        await browser.close().catch((closeError) => {
            console.error(`Failed to close browser after setup error: ${closeError.message}`);
        });
        throw error;
    }
}

Category Crawler Bot

This bot crawls the website browsenodes.com to gather categories and sub-categories, generating a structured JSON file representing the category tree.


/**
 * Recursively scrapes the category nodes found at `url` and their children.
 *
 * Leaf nodes and nodes whose id is already in `processedNodes` are skipped.
 * Every processed node is also merged into the module-level `currentTree`,
 * which is written to `OUTPUT_FILE` after each level has been processed.
 *
 * @param {object} page - Page handle forwarded to `scrapeNode`.
 * @param {string} url - URL of the page to scrape; also the base for resolving `childUrl`.
 * @param {number} [depth=0] - Recursion depth (0 for the initial call).
 * @param {Array} [path=[]] - Ids of the ancestor nodes, used for logging.
 * @returns {Promise<Array>} The nodes processed at this level, each with its
 *     `children` populated. If an error occurs, the initial call (depth 0)
 *     returns `currentTree`; nested calls return the nodes gathered so far.
 */
async function buildTree(page, url, depth = 0, path = []) {
    // Declared outside the try block so a failure can still hand back what was gathered.
    const newNodes = [];

    try {
        const nodes = await scrapeNode(page, url);

        for (const node of nodes) {
            if (node.leafNode || processedNodes.has(node.id)) continue;

            processedNodes.add(node.id);

            const newNode = { ...node, children: [] };

            // currentPath already ends with this node's id.
            const currentPath = [...path, node.id];
            console.log(`Processing on node: ${currentPath.join(' -> ')}`);

            if (node.childUrl) {
                const childUrl = new URL(node.childUrl, url).href;
                newNode.children = await buildTree(page, childUrl, depth + 1, currentPath);
            }

            newNodes.push(newNode);
        }

        // Merge newNodes with currentTree
        // NOTE(review): this runs at every depth, so nested nodes are also added
        // as top-level entries of currentTree — confirm that this is intended.
        for (const newNode of newNodes) {
            const existingNodeIndex = currentTree.findIndex(n => n.id === newNode.id);
            if (existingNodeIndex !== -1) {
                currentTree[existingNodeIndex] = { ...currentTree[existingNodeIndex], children: newNode.children };
            } else {
                currentTree.push(newNode);
            }
        }

        // Write the updated tree to the file after processing all nodes at this level
        await writeTree(OUTPUT_FILE, currentTree);

        // The recursive caller assigns this return value to `children`.
        return newNodes;
    } catch (error) {
        console.error(`Error scraping ${url}: ${error.message}`);
        // A nested call must not return currentTree: it would become a node's
        // `children` and that node is later pushed into currentTree, producing
        // a circular structure.
        return depth === 0 ? currentTree : newNodes;
    }
}

URL Checker Bot

This bot verifies that URLs are functional by visiting each one and confirming that it loads correctly.


/**
 * Opens `url` in a new page of the shared `browser` and reports whether it
 * returned any HTML content.
 *
 * Stylesheet, image, media and font requests are aborted. The page is always
 * closed before the function returns.
 *
 * @param {string} url - The URL to check.
 * @returns {Promise<{isWorking: boolean, headers: object, error?: string}>}
 *     `isWorking` is true when the loaded document has non-empty content.
 *     `headers` holds the response headers of the main-frame navigation (an
 *     empty object if no such response was received). `error` is `'Timeout'`
 *     on a navigation timeout, or the error message for any other failure.
 */
async function checkUrlHtmlInIframe(url) {
    const page = await browser.newPage();
    let headers = {};
    try {
        const timeout = 15000; // 15 seconds timeout

        // Disable CSS, images, and other media
        await page.setRequestInterception(true);
        page.on('request', (request) => {
            if (['stylesheet', 'image', 'media', 'font'].includes(request.resourceType())) {
                request.abort();
            } else {
                request.continue();
            }
        });

        // Collect headers of the checked document only. Without this filter the
        // headers of whichever sub-resource responded last (script, XHR, iframe)
        // would overwrite the document's own headers.
        page.on('response', (response) => {
            const isMainDocument = response.request().isNavigationRequest()
                && response.frame() === page.mainFrame();
            if (isMainDocument) {
                headers = response.headers();
            }
        });

        await page.goto(url, { waitUntil: 'domcontentloaded', timeout });
        const content = await page.content();
        const isWorking = content.trim().length > 0;

        return { isWorking, headers };
    } catch (error) {
        if (error.name === 'TimeoutError') {
            console.log(`Timeout checking ${url}`);
            return { isWorking: false, headers, error: 'Timeout' };
        }
        console.error(`Error checking ${url}:`, error.message);
        return { isWorking: false, headers, error: error.message };
    } finally {
        // Single cleanup point for the success, timeout and error paths.
        await page.close();
    }
}

Gallery