@50-Course
Created June 19, 2025 06:29
Revisions

  1. 50-Course created this gist Jun 19, 2025.
     async_scrapper.py: 449 additions, 0 deletions
import asyncio
import logging
import os
import random
import time
from contextlib import asynccontextmanager
from datetime import datetime
from pathlib import Path
from typing import Annotated, Any, Callable, Dict, List, Optional

from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from playwright.async_api import Browser, BrowserContext
from playwright.async_api import Error as PlaywrightError
from playwright.async_api import Locator, Page
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from playwright.async_api import async_playwright
from playwright_stealth import stealth_async

from src.scrapper.scrape_product_data_async import extract_product_data_async
from src.scrapper.scrape_product_tiles_async import scrape_product_overview_tiles

from .constants import (
    SELECTOR_CATEGORY_ITEM,
    SELECTOR_CATEGORY_LABEL,
    SELECTOR_CATEGORY_LABEL_SAFE,
    SELECTOR_HOMEPAGE_PRODUCTS_COLUMN,
    SELECTOR_INDEX_ENTRY_IMAGE,
    SELECTOR_INDEX_ENTRY_ITEM,
    SELECTOR_INDEX_ENTRY_LINK,
    SELECTOR_INDEX_ENTRY_TITLE,
    SELECTOR_INDEX_LIST_CONTAINER,
    SELECTOR_INDEX_PAGE_HEADER,
    SELECTOR_PRODUCTS_INNERMOST_CONTAINER,
    SELECTOR_SUBCATEGORY_LINK,
    SELECTOR_SUBCATEGORY_LINK_SAFE,
)
from .constants import _ResponseData as Response
from .utils import (
    browser_context,
    extract_product_link_from_tile,
    fallback_locator,
    get_random_user_agent,
    goto_with_retry,
    human_delay,
    is_valid_product_page,
    retry_with_backoff,
    write_category_to_excel,
)

logger = logging.getLogger(__name__)

url: str = "https://www.medicalexpo.com/"


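# Top-level entry coroutine: opens a stealth browser context, loads the target URL
# with retry/backoff, and hands off to `entrypoint` once the homepage product column
# is visible.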
async def scrape_url(
    url: str,
    headless: bool = False,
    debug: bool = False,
    slow_mo: int = 40,
    wait_for_load: int = 3000,
    to_excel: bool = False,
    output_dir: Path | None = None,
    send_notification: bool = False,
) -> None:
    try:
        async with browser_context(
            headless=headless,
            user_agent=get_random_user_agent(),
            bypass_csp=True,
        ) as ctx:
            page = await ctx.new_page()

            await stealth_async(page)

            await retry_with_backoff(lambda: page.goto(url, wait_until="networkidle"))
            if debug and wait_for_load > 0:
                await page.wait_for_timeout(wait_for_load)

            print("Checking for page response")

            parent_container_visible = await page.is_visible(
                SELECTOR_HOMEPAGE_PRODUCTS_COLUMN
            )
            if parent_container_visible:
                print("[INFO] Parent Container is Visible")
                await entrypoint(page, to_excel=to_excel)

    except (PlaywrightError, TimeoutError) as play_err:
        logger.exception(f"Error scraping URL: {play_err}")

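# Operation 2: fan out over every subcategory with bounded concurrency
# (asyncio.Semaphore(8)), giving each subcategory its own page and writing the
# scraped index entries back into the shared `categories` structure via `storage_`.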
async def scrape_all_subcategory_indexes(ctx: BrowserContext, categories):
    sem = asyncio.Semaphore(8)
    jobs = []

    for section in categories:
        for sub in section["subcategories"]:

            async def scrape_subcategory(name=sub["name"], url=sub["url"], storage=sub):
                async with sem:
                    page = await ctx.new_page()
                    try:
                        await scrape_product_listing_index(
                            page, name, url, storage_=storage
                        )
                    except Exception as e:
                        print(f"[ERROR] Failed scraping {name}: {e}")
                    finally:
                        await page.close()

            jobs.append(scrape_subcategory())

    await asyncio.gather(*jobs)

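# Scrapes a single subcategory index page: verifies that the page heading matches the
# expected subcategory name, then collects the title, href, and image metadata for
# each entry under `div#category-group ul.category-grouplist`.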
async def scrape_product_listing_index(
    page: Page,
    subcategory_name: str,
    subcategory_url: str,
    storage_: Optional[Response] = None,
) -> None:
    print(f"[INFO] Navigating to subcategory page: {subcategory_url}")
    await retry_with_backoff(lambda: page.goto(subcategory_url))
    await page.wait_for_selector(SELECTOR_INDEX_PAGE_HEADER)

    page_heading = await (
        await page.query_selector(SELECTOR_INDEX_PAGE_HEADER)
    ).inner_text()
    if page_heading.lower() != subcategory_name.lower():
        print(
            f"[WARN] Page mismatch: Expected '{subcategory_name}', got '{page_heading}'"
        )
        return

    # Wait for parent container
    await page.wait_for_selector("div#category-group ul.category-grouplist")
    group_nodes = await page.query_selector_all(
        "div#category-group ul.category-grouplist"
    )

    index_entries = []

    for group in group_nodes:
        item_nodes = await group.query_selector_all("li")
        for item in item_nodes:
            a_tag = await item.query_selector("a")
            img_tag = await item.query_selector("div.imgSubCat img")

            if not a_tag:
                continue

            name = (await a_tag.inner_text()).strip()
            href = await a_tag.get_attribute("href")
            img_src = await img_tag.get_attribute("src") if img_tag else ""
            img_alt = await img_tag.get_attribute("alt") if img_tag else ""

            index_entries.append(
                {
                    "title": name,
                    "href": href,
                    "image_meta": {
                        "src": img_src,
                        "alt": img_alt,
                    },
                }
            )

    print(index_entries)

    if storage_ is not None:
        storage_["index_entries"] = index_entries

    print(
        f"[INFO] Extracted {len(index_entries)} index entries from '{subcategory_name}'"
    )


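# Walks the homepage mega-menu: locates each top-level category item, clicks it open,
# and records its label together with the name/url of every subcategory link it exposes.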
async def extract_categories(
    page: Page, logger_func: Optional[Callable[[str], None]] = None
):
    logger_func = logger_func or print

    logger_func("[*] Looking for top-level category items...")
    section_items = await fallback_locator(
        page,
        [
            "li[data-cy^='universGroupItemCy_']",
            SELECTOR_CATEGORY_ITEM,
        ],
    )
    section_items = await section_items.all()

    logger_func(f"[+] Found {len(section_items)} top-level category items")

    categories: List[Dict[str, Any]] = []

    for i, section in enumerate(section_items):
        logger_func(f"\n[→] Processing category index {i}")

        try:
            label_node = await fallback_locator(
                page,
                scope=section,
                selectors=[
                    ":scope span[class*='UniverseGroupLabel']",
                    ":scope span[class*='universeGroup__UniverseGroupLabel']",
                    ":scope span",
                ],
            )
            print(f"[INFO] {label_node}")
            category_name = (await label_node.inner_text()).strip()
            logger_func(f" [✓] Category name: '{category_name}'")
        except Exception as e:
            logger_func(f" [!] Failed to extract category name: {e}")
            continue

        # expand dropdown
        try:
            # wait 5 secs
            await section.wait_for(timeout=5000)
            await section.click(timeout=2000)
            await human_delay(0.2)
            logger_func(" [✓] Clicked to expand dropdown")
        except Exception as e:
            logger_func(f" [!] Failed to expand category '{category_name}': {e}")

        subsections = await section.locator("ul li a").all()
        logger_func(f"[→] Section: {category_name} ({len(subsections)} subcategories)")

        subcategories = []
        for subsection in subsections:
            try:
                name = (await subsection.inner_text()).strip()
                href = await subsection.get_attribute("href")
                if name and href:
                    subcategories.append({"name": name, "url": href})
                    logger_func(f" [✓] Subsection: {name}")
            except Exception as e:
                logger_func(f" [!] Failed to extract subsection link: {e}")

        categories.append(
            {
                "section": category_name,
                "subcategories": subcategories,
            }
        )

        logger_func(
            f"[→] Completed Section: {category_name} ({len(subsections)} subcategories)"
        )

    logger_func("\n[✓] Completed extracting all categories.")
    return categories


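# Operation 1: waits for the homepage products container to be attached and visible,
# then delegates to `extract_categories` and optionally stores the result in `storage_`.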
async def extract_categories_from_homepage(
    page: Page, storage_: Optional[Response] = None
):
    print("[INFO] Entered inside the function: extract_categories_from_homepage")

    try:
        await page.wait_for_selector(
            SELECTOR_PRODUCTS_INNERMOST_CONTAINER, state="attached", timeout=15000
        )
        print("[INFO] Selector attached to DOM")
        container = page.locator(SELECTOR_PRODUCTS_INNERMOST_CONTAINER)
        is_visible = await container.is_visible()

        print(f"[INFO] Container visibility: {is_visible}")

        if not is_visible:
            print("[INFO] Element is attached but not visible")
            return

        print("[INFO] Element is attached AND visible. Proceeding.")
    except (PlaywrightTimeoutError, Exception):
        print("[ERROR] Innermost container never appeared in DOM")
        return

    try:
        categories = await extract_categories(page)
    except Exception as e:
        print(f"[ERROR] Failed to extract categories: {e}")
        return

    if storage_:
        storage_["categories"] = categories

    print(f"[INFO] Extracted {len(categories)} top-level sections.")
    return categories


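# Operations 3 + 4: for every index entry collected earlier, visits the product tile
# page, scrapes the tiles, then follows each product link to extract the full product
# record. Concurrency is capped at five concurrent pages via asyncio.Semaphore(5).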
async def scrape_product_overview(
    ctx: BrowserContext,
    categories: List[Dict[str, Any]],
    logger_func: Optional[Callable] = None,
):
    logger_func = logger_func or print

    sem = asyncio.Semaphore(5)

    entries_to_scrape = [
        entry
        for section in categories
        for sub in section.get("subcategories", [])
        for entry in sub.get("index_entries", [])
    ]

    async def scrape_entry(entry):
        async with sem:
            page = await ctx.new_page()
            try:
                print(f"[->] Visiting product tile index page: {entry.get('href')}")
                await page.goto(
                    entry["href"], timeout=60000, wait_until="domcontentloaded"
                )

                # operation 3: scrape all product tiles in this entry
                tile_data = await scrape_product_overview_tiles(page)

                # operation 4: for each product tile link, visit and extract full product data
                full_product_details = []
                for tile in tile_data:
                    product_url = tile.get("product_link")
                    if not product_url:
                        continue

                    try:
                        print(f"[->->] Visiting product link: {product_url}")
                        await page.goto(
                            product_url, timeout=60000, wait_until="domcontentloaded"
                        )

                        # I have just discovered that some product links cause redirects,
                        # breaking our `extract_product_data_async` logic
                        # if not await is_valid_product_page(
                        #     page, logger_func=logger_func
                        # ):
                        #     logger_func(
                        #         f"[WARN] Product page appears to be invalid, removed or moved permanently: {product_url}"
                        #     )
                        #     logger_func(
                        #         f"[SKIP] Soft 404 or placeholder page: {product_url}"
                        #     )
                        #     continue
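                        # Illustrative sketch (an assumption, not part of the original flow):
                        # one hedge against such redirects would be to compare the page's
                        # final URL with the requested one before extracting, e.g.:
                        #
                        #     response = await page.goto(product_url, wait_until="domcontentloaded")
                        #     if response is None or response.status >= 400 or page.url != product_url:
                        #         continue  # redirected, missing, or error page
                        #
                        # Whether a strict URL comparison is appropriate here depends on the
                        # site's redirect behaviour and would need verification.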

                        full_data = await extract_product_data_async(page)
                        full_product_details.append({**tile, **full_data})
                    except Exception as e:
                        print(
                            f"[WARN] Failed to extract full product at {product_url}: {e}"
                        )
                        continue

                entry["products"] = full_product_details
                print(f"[✓] Completed scraping for index entry: {entry.get('title')}")

            except Exception as e:
                print(
                    f"[WARN] Could not scrape product detail for {entry.get('href')}: {e}"
                )
            finally:
                await page.close()

    await asyncio.gather(*(scrape_entry(entry) for entry in entries_to_scrape))
    print("[INFO] Completed all tile + full product detail extractions.")


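# Orchestrates the full run: Operation 1 (homepage categories), Operation 2
# (subcategory index pages), Operations 3 + 4 (tiles and full product details), and an
# optional Excel export of everything collected.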
async def entrypoint(page: Page, to_excel=False) -> None:
    print("[INFO] Attempting to perform scraping...")
    scraped_data: Response = {}

    # OPERATION 1
    categories = await extract_categories_from_homepage(page)

    if not categories:
        print("[WARN] No categories extracted; nothing further to scrape.")
        return

    scraped_data["categories"] = categories

    print("[INFO] Completed Extract")

    # Operation 2
    await scrape_all_subcategory_indexes(page.context, scraped_data["categories"])

    # OPERATION 3 + 4
    await scrape_product_overview(page.context, scraped_data["categories"])

    # print(f"[INFO] {scraped_data}")
    print("[INFO] Successfully scraped website")

    if to_excel and "categories" in scraped_data:
        print("[DEBUG] Writing extracted categories to Excel file...")
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        write_category_to_excel(
            scraped_data["categories"], filename=f"scraped_expo_data_{timestamp}.xlsx"
        )


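# Example invocation (assuming the module lives at src/scrapper/async_scrapper.py, as
# the relative imports above suggest):
#   python -m src.scrapper.async_scrapper --headless --to-excel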
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="MedicalExpo Product Scraper")

    parser.add_argument(
        "--url",
        type=str,
        default="https://www.medicalexpo.com/",
        help="Target URL to scrape from.",
    )

    parser.add_argument(
        "--headless",
        action="store_true",
        help="Run browser in headless mode.",
    )

    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable debug mode.",
    )

    parser.add_argument(
        "--slow-mo",
        type=int,
        default=40,
        help="Slow motion delay in ms between browser actions (default: 40).",
    )

    parser.add_argument(
        "--wait-for-load",
        type=int,
        default=3000,
        help="Wait time in ms after initial page load (default: 3000).",
    )

    parser.add_argument(
        "--to-excel",
        action="store_true",
        help="Whether to write the result to Excel.",
    )

    parser.add_argument(
        "--output-dir",
        type=Path,
        default=None,
        help="Path to directory for saving output files.",
    )

    parser.add_argument(
        "--notify",
        action="store_true",
        help="Send notification after scraping (e.g., Slack/Email/WhatsApp or text).",
    )

    args = parser.parse_args()

    asyncio.run(
        scrape_url(
            args.url,
            headless=args.headless,
            debug=args.debug,
            slow_mo=args.slow_mo,
            wait_for_load=args.wait_for_load,
            to_excel=args.to_excel,
            output_dir=args.output_dir,
            send_notification=args.notify,
        )
    )