local hp = require "gumbo"
local b64 = require "base64"

-- Class names looked up in the parsed page (via getElementsByClassName, or
-- matched against an element's className).
local banner = "_30q-"
local docBody = "_39k5"
local title = "_4lmk"
local bold = "_4yxo"
local italics = "_4yxp"
local emoji = "_47e3"
local hidden = "_7oe"
local date = "_39g5"

-- DokuWiki namespace prefixed to every emitted image reference.
local namespace = "fbdoc"

-- Slug of the page currently being converted; set by treatDocument and read
-- by treatImage / getBanner to name the image files.
local pageUrl

-- Number of inline images saved so far for the current page.
local imgCount = 0

-- Heading tag -> DokuWiki heading marker.
local headingMarks = { H2 = "=====", H3 = "====", H4 = "===" }

-- Replacements for decodeEntities, keyed by the text between "&" and ";".
-- NOTE(review): the entity names were missing from the code this was
-- reconstructed from; they are inferred from the replacement characters.
-- The apostrophe spelling is a guess, so the common variants are accepted.
local entityChars = {
  quot = '"',
  amp = "&",
  lt = "<",
  gt = ">",
  apos = "'",
  ["#39"] = "'",
  ["#039"] = "'",
}

--- Replace the HTML entities listed in entityChars with their characters.
-- Runs as a single pass so the output of one replacement is never decoded a
-- second time ("&amp;lt;" becomes "&lt;", not "<"). Entities that are not in
-- the table are left untouched.
-- @param s string to decode
-- @return the decoded string
local function decodeEntities(s)
  -- Parentheses drop gsub's second return value (the match count).
  return (s:gsub("&(#?%w+);", entityChars))
end

--- Convert every child of `node`, in order, appending to `output`.
local function treatChildren(node, output)
  for _, child in ipairs(node.childNodes) do
    treat(child, output)
  end
end

--- Emit `open`, then the converted children of `node`, then `close`.
local function wrapChildren(node, output, open, close)
  table.insert(output, open)
  treatChildren(node, output)
  table.insert(output, close)
end

--- Emit `marker` in front of each converted child of a list node.
local function treatList(node, output, marker)
  for _, child in ipairs(node.childNodes) do
    table.insert(output, marker)
    treat(child, output)
  end
end

--- Convert one DOM node (and its subtree) to DokuWiki markup.
-- Nodes that are not recognised are printed to stdout and produce no output.
-- @param node   gumbo DOM node
-- @param output array of string fragments; the markup is appended to it
function treat(node, output)
  local name = node.nodeName

  if name == "#text" then
    table.insert(output, node.data or "")
  elseif name == "DIV" or name == "LI" or name == "FIGURE" then
    treatChildren(node, output)
    table.insert(output, "\n\n")
  elseif name == "SPAN" then
    -- `or ""` guards against className being nil on a span without a class
    -- attribute (NOTE(review): confirm what gumbo returns in that case).
    local class = node.className or ""
    if class:match(bold) then
      wrapChildren(node, output, "**", "**")
    elseif class:match(italics) then
      wrapChildren(node, output, "//", "//")
    elseif class:match(emoji) then
      treatChildren(node, output)
    elseif class:match(hidden) then
      -- nothing!
    else
      print("Unhandled node", name)
      print(node)
    end
  elseif name == "A" then
    -- An anchor without href would otherwise raise on concatenation.
    local href = node:getAttribute "href" or ""
    wrapChildren(node, output, "[[" .. href .. "|", "]]")
  elseif name == "OL" then
    -- DokuWiki only recognises a list item when it is indented by at least
    -- two spaces.
    treatList(node, output, "  - ")
  elseif name == "UL" then
    treatList(node, output, "  * ")
  elseif name == "BR" then
    table.insert(output, "\\\\\n")
  elseif headingMarks[name] then
    local mark = headingMarks[name]
    wrapChildren(node, output, mark .. " ", " " .. mark .. "\n\n")
  elseif name == "IMG" then
    treatImage(node, output)
  else
    print("Unhandled node", name)
    print(node)
  end
end

--- Decode a base64 image data URI and write it under "img/".
-- @param data string containing "data:image/<format>;base64,<payload>"
-- @param path file name without extension, relative to "img/"
-- @return `path` with "." and the image format appended
-- Raises an error if `data` is not such a URI or the file cannot be opened.
function saveImage(data, path)
  if type(data) ~= "string" then
    error("saveImage: expected a data URI string, got " .. type(data), 2)
  end
  local format, payload = data:match("data:image/(%w-);base64,(.*)")
  if not format then
    error("saveImage: not a base64 image data URI", 2)
  end

  local finalPath = path .. "." .. format
  local blob = b64.decode(payload)
  local fp = assert(io.open("img/" .. finalPath, "wb"))
  fp:write(blob)
  fp:close()
  return finalPath
end

--- Save the image embedded in an IMG node's src attribute and emit a
-- DokuWiki reference to it. Files are named "<pageUrl>-<n>", where n counts
-- the images of the current page.
-- @param node gumbo IMG element
-- @param out  array of string fragments; the reference is appended to it
function treatImage(node, out)
  imgCount = imgCount + 1
  local data = node:getAttribute "src"
  local path = saveImage(data, pageUrl .. "-" .. imgCount)
  table.insert(out, "{{ " .. namespace .. ":" .. path .. " }}")
end

--- Find the "--savepage-url-<number>: url(...)" declaration in `src` and
-- save the image it holds.
-- @param number identifier of the declaration to look up
-- @param src    page source to search
-- @param path   file name without extension, relative to "img/"
-- @return the saved file name, or nil when no such declaration exists
function extractImage(number, src, path)
  -- The capture stops before the closing parenthesis so that it is not
  -- handed to the base64 decoder as part of the payload.
  local data = src:match("%-%-savepage%-url%-" .. number .. ": url%((.-)%)")
  if not data then
    return nil
  end
  return saveImage(data, path)
end

--- Save the banner image of the page, if it has one.
-- @param root parsed document
-- @param src  page source, searched for the image data
-- @return the saved file name, or nil when there is no banner element or it
--         carries no usable "--savepage-url-" reference
function getBanner(root, src)
  local elem = root:getElementsByClassName(banner)[1]
  if not elem then
    return
  end
  local style = elem:getAttribute "style"
  local ref = style and style:match("%-%-savepage%-url%-(%w*)")
  if not ref then
    return
  end
  return extractImage(ref, src, pageUrl .. "-banner")
end

-- Month names by number, plus the reverse mapping from name to number.
local months = {"January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"}
for i, v in ipairs(months) do
  months[v] = i
end

--- Convert one saved HTML page to a DokuWiki page.
-- Writes "pages/<iso-date>-<slug>.txt" and stores the page's images under
-- "img/"; both directories must already exist.
-- @param path path of the HTML file to convert
-- Raises an error when the file cannot be read or parsed, or when the title,
-- date or body element is missing.
function treatDocument(path)
  imgCount = 0

  local input = assert(io.open(path, "r"))
  local src = input:read "a"
  input:close()

  -- NOTE(review): both patterns below look truncated. As written the first
  -- returns `src` unchanged and the second always yields "". They presumably
  -- carried markup delimiters originally; restore them from the upstream
  -- script before relying on this function.
  src = src:match('(.*)')
  src = decodeEntities(src)
  local body = src:match('')

  local root, err = hp.parse(body)
  if not root then
    error(err)
  end
  local out = {}

  -- Get title
  local titleElem = root:getElementsByClassName(title)[1]
  if not titleElem then
    error("no title element found in " .. path)
  end
  local titleText = titleElem.textContent
  table.insert(out, "====== " .. titleText .. " ======\n\n")

  -- Double dashes and final dashes break dokuwiki for some reason
  local url = titleText:lower():gsub("[^%w ]", ""):gsub(" ", "-")
  url = url:gsub("%-+", "-"):gsub("%-$", "")

  -- Get date
  local dateElem = root:getElementsByClassName(date)[1]
  if not dateElem then
    error("no date element found in " .. path)
  end
  local dateText = dateElem.textContent
  local month, day, year = dateText:match("(%w+) (%d+), (%d+)")
  local monthNumber = months[month]
  if not monthNumber then
    error(string.format("unrecognised date %q in %s", dateText, path))
  end
  local isoDate = string.format("%04d-%02d-%02d", tonumber(year), monthNumber, tonumber(day))

  -- Final URL
  pageUrl = isoDate .. "-" .. url
  print(" " .. pageUrl)

  -- Banner
  local bannerPath = getBanner(root, src)
  if bannerPath then
    table.insert(out, "{{ " .. namespace .. ":" .. bannerPath .. " }}\n\n")
  end

  -- Body
  local mainDoc = root:getElementsByClassName(docBody)[1]
  if not mainDoc then
    error("no document body element found in " .. path)
  end
  treat(mainDoc, out)

  local fp = assert(io.open("pages/" .. pageUrl .. ".txt", "w"))
  fp:write(table.concat(out))
  fp:close()
end

-- Convert every file named on the command line.
do
  local args = {...}
  for i, v in ipairs(args) do
    print(i, v)
    treatDocument(v)
  end
end