#!/usr/bin/env ruby
# From: http://ngauthier.com/2014/06/scraping-the-web-with-ruby.html

require 'capybara'
require 'capybara/poltergeist'
require 'csv'
require 'gdbm'
require 'json'
require 'ostruct'

# Scrapes article summaries and bodies from ngauthier.com and emits them
# as CSV on the given IO stream.
class NickBot
  include Capybara::DSL

  def initialize(io = STDOUT)
    # Poltergeist drives a headless PhantomJS browser.
    Capybara.default_driver = :poltergeist
    @io = io
  end

  def scrape
    # Pull article summaries off the index page; skip ones already stored.
    visit "http://ngauthier.com/"
    all(".posts .post").each do |post|
      article = Article.from_summary(post)
      next unless article.new_record?
      article.save
    end

    # Visit each stored article once to fill in its full body text.
    Article.each do |article|
      next if article.body
      visit "http://ngauthier.com#{article.url}"
      has_content?(article.title) or raise "couldn't load #{article.url}"
      article.body = find("article").text
      article.save
    end

    # Dump every stored article to the output stream as CSV.
    CSV(@io) do |csv|
      csv << ["Title", "URL", "Date", "Summary", "Body"]
      Article.each do |article|
        csv << [
          article.title,
          article.url,
          article.date,
          article.summary,
          article.body
        ]
      end
    end
  end

  # A schemaless record persisted as JSON in a GDBM file, keyed by URL.
  class Article < OpenStruct
    DB = GDBM.new("articles.db")

    def self.from_summary(node)
      new(
        title: node.find("h3 a").text,
        url: node.find("h3 a")["href"],
        date: node.find("h3 small").text,
        summary: node.find("p.preview").text,
      )
    end

    def self.each
      DB.each do |url, json|
        yield Article.new(JSON.load(json))
      end
    end

    def save
      DB[url] = to_h.to_json
    end

    def new_record?
      DB[url].nil?
    end
  end
end

NickBot.new(STDOUT).scrape
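
# A minimal usage sketch, assuming the capybara and poltergeist gems are
# installed and a PhantomJS binary is on the PATH for poltergeist to drive.
# The filename scrape.rb below is hypothetical:
#
#   gem install capybara poltergeist
#   ruby scrape.rb > articles.csv
#
# Runs are incremental: summaries already present in articles.db are not
# re-saved, and each article body is only downloaded once.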