Skip to content

Instantly share code, notes, and snippets.

@wlib
Created August 6, 2025 01:42
Show Gist options
  • Save wlib/1ec790c2ee8367ad97cab4f499ad5eab to your computer and use it in GitHub Desktop.
Save wlib/1ec790c2ee8367ad97cab4f499ad5eab to your computer and use it in GitHub Desktop.

Revisions

  1. wlib created this gist Aug 6, 2025.
    198 changes: 198 additions & 0 deletions scrape-amazon-orders.mjs
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,198 @@
    // Base URL of the Amazon orders page. It is fetched as-is first, and the
    // per-year URLs are derived from it by adding a `timeFilter` query parameter.
    const ordersUrl = "https://www.amazon.com/your-orders/orders"

    /**
     * Pause for a fixed amount of time.
     *
     * @param {number} [ms=1000] - Delay in milliseconds.
     * @returns {Promise<undefined>} Promise that fulfills (with no value) once the timer fires.
     */
    const wait = (ms = 1_000) => new Promise(done => setTimeout(done, ms))

    /**
     * Download a page with `fetch` and parse its HTML into a Document.
     *
     * @param {string|URL} url - Address of the page to download.
     * @returns {Promise<Document|undefined>} The parsed document, or `undefined`
     *   when the response is not OK or the body is empty.
     */
    const fetchDocument = async url => {
        const response = await fetch(url)
        // The body is only read for successful responses.
        const markup = response.ok ? await response.text() : undefined

        return markup ? Document.parseHTMLUnsafe(markup) : undefined
    }

    /**
     * Load a URL in a temporary iframe and return a detached snapshot of its DOM.
     *
     * The iframe's serialized `outerHTML` is re-parsed with
     * `Document.parseHTMLUnsafe`, so the returned Document is independent of the
     * iframe, which is removed from the page once the snapshot has been taken.
     *
     * @param {string} url - Address to load inside the iframe.
     * @returns {Promise<Document>} Snapshot of the iframe's document at `load` time.
     *   Rejects if the iframe's document cannot be read (e.g. `contentDocument`
     *   is null) or cannot be parsed.
     */
    const fetchDocumentViaIframe = async url =>
        new Promise((resolve, reject) => {
            const iframe = document.createElement("iframe")

            iframe.addEventListener("load", () => {
                try {
                    const html = iframe.contentDocument.documentElement.outerHTML
                    resolve(Document.parseHTMLUnsafe(html))
                }
                catch (error) {
                    // Without this the promise would stay pending forever.
                    reject(new Error(`Could not read iframe content for ${url}`, { cause: error }))
                }
                finally {
                    // Don't let one iframe per scraped page pile up in the host page.
                    iframe.remove()
                }
            }, { once: true })

            // Assign src before attaching so the first navigation (and therefore
            // the first load event) is for the requested URL.
            iframe.src = url

            // An iframe only navigates once it is connected to a document; a
            // detached iframe never fires "load".
            document.body.append(iframe)
        })

    // Fetch the base orders page; its `timeFilter` dropdown is the source of the
    // per-year URLs built below.
    const ordersPage = await fetchDocument(ordersUrl)

    // fetchDocument yields undefined for a non-OK response. Fail with a clear
    // message instead of a TypeError on the property access below.
    if (!ordersPage)
        throw new Error(`Could not fetch the orders page at ${ordersUrl}`)

    const timeFilterSelect = ordersPage.querySelector("select[name = timeFilter]")
    if (!timeFilterSelect)
        throw new Error("Orders page has no timeFilter dropdown (not signed in, or the page layout changed?)")

    // One orders URL per dropdown option whose value starts with "year-";
    // any other option is ignored.
    const ordersUrlsByYear = [...timeFilterSelect.options]
        .filter(option => option.value.startsWith("year-"))
        .map(option => {
            const url = new URL(ordersUrl)
            url.searchParams.set("timeFilter", option.value)
            return url.href
        })

    // Every distinct order ID discovered so far, across all scraped pages.
    const orderIds = new Set()

    /**
     * Collect order IDs from the invoice links found in a document.
     *
     * A link counts when, resolved against `baseUrl`, it points at Amazon's
     * printable order summary and carries a non-empty `orderID` query parameter.
     * Matching IDs are added to the shared `orderIds` set.
     *
     * @param {Document} doc - Document whose `<a>` elements are inspected.
     * @param {string|URL} baseUrl - Base used to resolve relative hrefs.
     * @returns {void}
     */
    const extractOrderIds = (doc, baseUrl) => {
        const invoicePrefix = "https://www.amazon.com/gp/css/summary/print.html"

        // Resolve an anchor's href; null when it cannot be turned into a URL.
        const resolveAnchor = anchor => {
            try {
                return new URL(anchor.href, baseUrl)
            }
            catch {
                return null
            }
        }

        for (const anchor of doc.querySelectorAll("a")) {
            const target = resolveAnchor(anchor)
            const isInvoiceLink = target !== null && target.href.startsWith(invoicePrefix)
            const orderId = isInvoiceLink ? target.searchParams.get("orderID") : null

            if (orderId)
                orderIds.add(orderId)
        }
    }

    // For each per-year orders URL: scrape the first page, read the highest
    // `startIndex` from its pagination links, then scrape every further page in
    // steps of 10. Each page load is followed by a pause.
    for (const yearUrl of ordersUrlsByYear) {
        const firstPage = await fetchDocumentViaIframe(yearUrl)
        await wait()

        extractOrderIds(firstPage, yearUrl)

        // Largest startIndex among the pagination links (stays undefined if none qualify).
        let maxStartIndex
        for (const link of firstPage.querySelectorAll(".a-pagination a")) {
            let target
            try {
                target = new URL(link.href, yearUrl)
            }
            catch {
                continue
            }

            if (!target.href.startsWith(ordersUrl))
                continue

            const startIndex = parseInt(target.searchParams.get("startIndex"))
            if (isNaN(startIndex))
                continue

            if (maxStartIndex === undefined || startIndex > maxStartIndex)
                maxStartIndex = startIndex
        }

        // No pagination (or only startIndex 0): this year fits on one page.
        if (!maxStartIndex)
            continue

        const extraPageCount = Math.trunc(maxStartIndex / 10)
        for (let pageNumber = 1; pageNumber <= extraPageCount; pageNumber++) {
            const pageUrl = new URL(yearUrl)
            pageUrl.searchParams.set("startIndex", pageNumber * 10)

            const page = await fetchDocumentViaIframe(pageUrl.href)
            await wait()

            extractOrderIds(page, yearUrl)
        }
    }

    /**
     * Extract the order date and purchased items from an invoice page that uses
     * the `data-component` based markup.
     *
     * Every field is best-effort: anything that cannot be found in the page is
     * reported as `undefined` rather than causing a failure.
     *
     * @param {Document} invoicePage - Parsed invoice document.
     * @param {string|URL} baseUrl - Base used to resolve relative image/product URLs.
     * @returns {{
     *   orderDate: string|undefined,
     *   items: Array<{
     *     image: string|undefined,
     *     title: string|undefined,
     *     url: string|undefined,
     *     unitPrice: string|undefined,
     *     quantity: number|undefined
     *   }>
     * }} `url` is the product link with its query string removed; `unitPrice`
     *   is the raw price text.
     */
    const extractModernInvoiceInfo = (invoicePage, baseUrl) => {
        // Resolve a possibly-relative URL against baseUrl. Missing/empty input
        // yields undefined: `new URL(undefined, base)` would NOT throw, it would
        // resolve the literal string "undefined" into a bogus URL.
        const resolveUrl = raw => {
            if (!raw)
                return undefined

            try {
                return new URL(raw, baseUrl)
            }
            catch {
                return undefined
            }
        }

        const orderDate = invoicePage.querySelector("[data-component = orderDate]")?.textContent.trim()

        const items = [...invoicePage.querySelectorAll("[data-component = purchasedItems] .a-fixed-left-grid")]
            .map(purchasedItem => {
                const imageContainer = purchasedItem.querySelector("[data-component = itemImage]")
                const image = resolveUrl(imageContainer?.querySelector("img")?.getAttribute("src"))?.href

                const quantityText = imageContainer?.querySelector(".od-item-view-qty")?.textContent.trim()
                const parsedQuantity = Number.parseInt(quantityText, 10)
                const quantity = Number.isNaN(parsedQuantity) ? undefined : parsedQuantity

                const titleLink = purchasedItem.querySelector("[data-component = itemTitle] a")
                const title = titleLink?.textContent.trim()

                // Drop the query string so the product URL has no query parameters.
                const productUrl = resolveUrl(titleLink?.href)
                if (productUrl)
                    productUrl.search = ""
                const url = productUrl?.href

                const unitPrice = purchasedItem.querySelector("[data-component = unitPrice] .a-offscreen")?.textContent.trim()

                return { image, title, url, unitPrice, quantity }
            })

        return { orderDate, items }
    }

    /**
     * Extract the order date and purchased items from an invoice page that uses
     * the table-based markup.
     *
     * The order date is the text of the first innermost table cell containing
     * "Order Placed:", with that label removed. An item is any table row that has
     * both an italic element (used as the title) and a right-aligned cell whose
     * text contains "$" (used as the unit price). The quantity is read from an
     * "N of:" prefix just before the title and defaults to 1. Rows with the same
     * title and unit price are merged by summing their quantities.
     *
     * @param {Document} invoicePage - Parsed invoice document.
     * @param {string|URL} baseUrl - Unused; kept so both extractors share a signature.
     * @returns {{
     *   orderDate: string|undefined,
     *   items: Array<{ title: string, unitPrice: string, quantity: number }>
     * }}
     */
    const extractLegacyInvoiceInfo = (invoicePage, baseUrl) => {
        const orderPlacedLabel = "Order Placed:"

        let orderDate
        for (const cell of invoicePage.querySelectorAll("td:not(:has(td))")) {
            if (cell.textContent.includes(orderPlacedLabel)) {
                orderDate = cell.textContent.replace(orderPlacedLabel, "").trim()
                break
            }
        }

        // Keyed by title + unit price; Map iteration keeps first-seen order.
        const mergedItems = new Map()

        for (const row of invoicePage.querySelectorAll("tr")) {
            const titleNode = row.querySelector("i")
            const priceNode = row.querySelector("td[align = right]")
            if (!titleNode || !priceNode || !priceNode.textContent.includes("$"))
                continue

            const parsedCount = parseInt(titleNode.previousSibling?.textContent.match(/(\d+)\s+of:/)?.[1])
            const quantity = isNaN(parsedCount) ? 1 : parsedCount
            const title = titleNode.textContent.trim()
            const unitPrice = priceNode.textContent.trim()

            const key = `${title}\0${unitPrice}`
            const existing = mergedItems.get(key)
            if (existing)
                existing.quantity += quantity
            else
                mergedItems.set(key, { title, unitPrice, quantity })
        }

        return { orderDate, items: [...mergedItems.values()] }
    }

    // Maps each order ID to the { orderDate, items } extracted from its invoice page.
    const orderIdToInfo = {}

    for (const orderId of orderIds) {
        const invoiceUrl = new URL("https://www.amazon.com/gp/css/summary/print.html")
        invoiceUrl.searchParams.set("orderID", orderId)
        const invoicePage = await fetchDocument(invoiceUrl)
        await wait()

        // fetchDocument yields undefined for a non-OK response. Skip that one
        // order instead of throwing and losing everything scraped so far.
        if (!invoicePage) {
            console.warn(`Skipping order ${orderId}: invoice page could not be fetched`)
            continue
        }

        // Pages with an orderDate data-component use the modern extractor;
        // all others use the legacy extractor.
        orderIdToInfo[orderId] =
            invoicePage.querySelector("[data-component = orderDate]")
                ? extractModernInvoiceInfo(invoicePage, invoiceUrl)
                : extractLegacyInvoiceInfo(invoicePage, invoiceUrl)
    }

    console.log(orderIdToInfo)