# frozen_string_literal: true

require "fileutils"
require "httparty"
require "json"
require "nokogiri"
require "open-uri"

# Base URL of the scraped site. Stored paths are made relative to it, and
# downloads are requested by prefixing it back onto those relative paths.
DOMAIN = "http://www.warplane.com/"

# Removes every occurrence of DOMAIN from +url+, leaving a site-relative path.
#
# @param url [String] an absolute or already-relative URL
# @return [String] the URL with DOMAIN stripped out
def strip_domain(url)
  url.gsub(DOMAIN, "")
end

# Builds the list of planes from the collection index page.
#
# Anchors whose href is "#" are skipped.
#
# @param page [Nokogiri::HTML::Document] parsed index page
# @return [Array<Hash>] one hash per plane with "link", "thumbnail", "name"
def get_planes_array(page)
  plane_links = page.css(".div-button a").reject { |link| link["href"] == "#" }

  plane_links.map do |link|
    {
      "link" => link["href"],
      "thumbnail" => strip_domain(link.css("img")[0]["src"]),
      "name" => link.css("b")[0].text
    }
  end
end

# Fetches and parses a plane's detail page.
#
# Uses URI.open: open-uri no longer hooks Kernel#open for URLs on Ruby 3.0+.
#
# @param plane [Hash] plane hash containing a "link" URL
# @return [Nokogiri::HTML::Document]
def get_detail_page(plane)
  Nokogiri::HTML(URI.open(plane["link"]))
end

# Collects gallery photo metadata from a plane's detail page.
#
# Despite the historical name, nothing is downloaded here; this only gathers
# paths and descriptions.
#
# @param detail_page [Nokogiri::HTML::Document]
# @return [Array<Hash>] hashes with "src", "description" and, when present,
#   "thumbnail"; an empty array if extraction fails, so callers can always
#   iterate the result
def get_gallery_photos(detail_page)
  detail_page.css("table[id*=PhotosDataList] a").map do |link|
    image = {
      "src" => strip_domain(link["href"]),
      "description" => link["title"]
    }

    # Some images have no thumbnail. When one exists it is the inline
    # background image of the first child div, so capture what sits between
    # the parentheses of the style attribute.
    div = link.css("div")[0]
    match = /\((.*)\)/.match((div && div["style"]).to_s)
    image["thumbnail"] = strip_domain(match[1]) if match

    image
  end
rescue => e
  puts e.message
  []
end

# Extracts the specification table from the sidebar of a detail page.
#
# Each fragment of the form "Label:</b> value" becomes an entry whose key is
# the label downcased with spaces replaced by underscores, and whose value is
# the stripped remainder. Fragments that do not fit that shape are ignored.
#
# NOTE(review): the separator below and the extraction regex look like they
# may have lost HTML tags (possibly "<br>" and a leading "<b>") somewhere
# along the way -- confirm against the site markup before relying on the keys.
#
# @param detail_page [Nokogiri::HTML::Document]
# @return [Hash{String => String}]
def get_plane_specs(detail_page)
  # The sidebar is taken to be the last <td> carrying a style attribute.
  sidebar = detail_page.css("td[style]").last

  sidebar.to_s.split("\n").each_with_object({}) do |fragment, specs|
    match = fragment.match(/(.+):<\/b>(.*)/)
    next unless match

    specs[match[1].downcase.gsub(" ", "_")] = match[2].strip
  end
end

# Downloads DOMAIN + path and stores it at the same relative path locally
# (expanded against the current working directory).
#
# The request is made before the file is opened so that a failed request does
# not leave an empty or truncated file behind, and non-2xx responses are
# skipped rather than saved as if they were image data.
#
# @param path [String] site-relative path of the image
# @return [Integer, nil] bytes written, or nil when the download was skipped
def download_plane_image(path)
  url = "#{DOMAIN}#{path}"
  puts "Downloading #{url}"

  response = HTTParty.get(url)
  unless (200..299).cover?(response.code)
    puts "Skipping #{url}: HTTP #{response.code}"
    return
  end

  localpath = File.expand_path(path)
  FileUtils.mkdir_p(File.dirname(localpath))
  # Write the raw body; parsed_response could be transformed by a parser.
  File.open(localpath, "wb") { |f| f.write(response.body) }
end

# Writes the collected plane data to planes.json in the working directory.
#
# @param planes [Array<Hash>]
# @return [void]
def write_json_file(planes)
  File.open("planes.json", "wb") { |f| f.write(JSON.pretty_generate(planes)) }
  puts "Wrote planes to file"
end

# "main"
page = Nokogiri::HTML(URI.open("http://www.warplane.com/warplane-vintage-aircraft-collection.aspx"))
planes = get_planes_array(page)

planes.each do |plane|
  detail_page = get_detail_page(plane)
  plane["images"] = get_gallery_photos(detail_page)
  plane["specs"] = get_plane_specs(detail_page)

  # The plane's own thumbnail is fetched exactly once, even when the gallery
  # is empty, instead of once per gallery image.
  thread_list = [Thread.new { download_plane_image(plane["thumbnail"]) }]

  plane["images"].each do |image|
    thread_list << Thread.new do
      download_plane_image(image["src"])
      download_plane_image(image["thumbnail"]) if image["thumbnail"]
    end
  end

  thread_list.each(&:join)
end

write_json_file(planes)