require "fileutils"
require "httparty"
require "json"
require "nokogiri"
require "open-uri"
# Base URL of the scraped site; stripped from scraped URLs and prepended when
# downloading. Frozen so the shared constant cannot be mutated in place.
DOMAIN = "http://www.warplane.com/".freeze
# Removes every occurrence of DOMAIN from the given URL string, turning an
# absolute site URL into a site-relative path. Strings that do not contain
# DOMAIN are returned unchanged (as a new string).
#
# url - String URL or path.
#
# Returns a new String.
def strip_domain(url)
  url.gsub(DOMAIN, "")
end
# Builds the planes list from the collection index page.
#
# Every anchor matched by ".div-button a" whose href is not "#" becomes a
# hash with three string keys:
#   "link"      - the anchor's href, untouched
#   "thumbnail" - src of the anchor's first <img>, with DOMAIN stripped
#   "name"      - text of the anchor's first <b>
#
# page - parsed document responding to #css.
#
# Returns an Array of Hashes.
def get_planes_array(page)
  page.css(".div-button a").each_with_object([]) do |anchor, planes|
    href = anchor["href"]
    # "#" anchors are placeholders, not links to a plane
    next if href == "#"

    thumbnail_src = anchor.css("img")[0]["src"]
    caption = anchor.css("b")[0].text
    planes << {
      "link" => href,
      "thumbnail" => strip_domain(thumbnail_src),
      "name" => caption,
    }
  end
end
# Fetches a plane's detail page and returns it as a parsed Nokogiri document.
#
# The link is opened through the parsed URI's own #open (provided by open-uri)
# rather than Kernel#open: the link comes from scraped markup, and Kernel#open
# treats a string starting with "|" as a command to run. Kernel#open also no
# longer accepts URLs on Ruby 3.0+. The block form closes the response IO once
# the document has been parsed.
#
# plane - Hash with a "link" key holding the absolute URL of the detail page.
#
# Returns the parsed document. Raises if the link is not a URL open-uri can
# fetch or if the request fails.
def get_detail_page(plane)
  URI.parse(plane["link"]).open { |io| Nokogiri::HTML(io) }
end
# Collects the gallery photo entries listed on a plane's detail page.
# (Nothing is downloaded here; only the paths and descriptions are gathered.)
#
# Each anchor inside the "PhotosDataList" table becomes a hash with:
#   "src"         - the anchor's href with DOMAIN stripped
#   "description" - the anchor's title attribute
#   "thumbnail"   - only when the anchor's first <div> has an inline style
#                   containing "(...)"; the text between the parentheses,
#                   with DOMAIN stripped
#
# A photo without an inner <div> or without a style attribute is kept, just
# without a "thumbnail" key, instead of aborting the whole gallery.
#
# detail_page - parsed document responding to #css.
#
# Returns an Array of Hashes. If anything raises, the message is printed and
# an empty Array is returned so callers can still iterate the result.
def get_gallery_photos(detail_page)
  detail_page.css("table[id*=PhotosDataList] a").map do |link|
    photo = {
      "src" => strip_domain(link["href"]),
      "description" => link["title"]
    }
    # some images have no thumbnail, so the inline background image is
    # looked up defensively: no <div> or no style simply means no thumbnail
    preview = link.css("div")[0]
    style = preview && preview["style"]
    background = style && style.match(/\((.*)\)/)
    photo["thumbnail"] = strip_domain(background[1]) if background
    photo
  end
rescue => e
  puts e.message
  []
end
# Extracts the plane's specification table from its detail page.
#
# The last <td> carrying a style attribute is treated as the sidebar. Its
# markup is split on newlines, and every line of the form "label:</b>value"
# contributes one entry: the label lowercased with spaces replaced by
# underscores as the key, the stripped remainder of the line as the value.
# Lines that do not have that form are skipped.
#
# NOTE(review): the label capture takes everything before ":</b>" on the
# line, so any markup preceding the label ends up in the key - confirm this
# is intended.
#
# detail_page - parsed document responding to #css.
#
# Returns a Hash of String keys to String values (empty when there is no
# sidebar or no matching line).
def get_plane_specs(detail_page)
  # get sidebar
  sidebar = detail_page.css("td[style]").last
  sidebar.to_s.split("\n").each_with_object({}) do |line, specs|
    matches = line.match(/(.+):<\/b>(.*)/)
    next unless matches

    specs[matches[1].downcase.gsub(" ", "_")] = matches[2].strip
  end
end
# Downloads DOMAIN + path and stores the response at the same relative path
# under the current working directory, creating parent directories as needed.
#
# path comes from scraped markup, so it is resolved first and skipped (with a
# message) when it would not land inside the working directory. The request
# is made before the file is opened, so a failed download does not leave an
# empty file behind.
#
# path - String site-relative path of the image.
#
# Returns the number of bytes written, or nil when the path was skipped.
# Errors from the request or from writing the file propagate to the caller.
def download_plane_image(path)
  root = Dir.pwd
  root += File::SEPARATOR unless root.end_with?(File::SEPARATOR)
  localpath = File.expand_path(path)
  unless localpath.start_with?(root)
    puts "Skipping #{path}: resolves outside #{Dir.pwd}"
    return
  end

  puts "Downloading #{DOMAIN}#{path}"
  data = HTTParty.get(DOMAIN + path).parsed_response
  FileUtils.mkdir_p(File.dirname(localpath))
  File.open(localpath, "wb") { |f| f.write(data) }
end
# Serialises the planes collection as pretty-printed JSON into "planes.json"
# in the current working directory (overwriting any existing file), then
# prints a confirmation line.
#
# planes - JSON-serialisable collection of plane hashes.
#
# Returns nil.
def write_json_file(planes)
  File.open("planes.json", "wb") do |file|
    file.write(JSON.pretty_generate(planes))
  end
  puts "Wrote planes to file"
end
# "main"
# Scrape the collection index, enrich every plane with its gallery and specs,
# download the referenced images, then write everything to planes.json.
#
# The index is fetched through the parsed URI's #open (open-uri); Kernel#open
# no longer accepts URLs on Ruby 3.0+. The block form closes the response IO.
index_url = "http://www.warplane.com/warplane-vintage-aircraft-collection.aspx"
page = URI.parse(index_url).open { |io| Nokogiri::HTML(io) }
planes = get_planes_array(page)
planes.each do |plane|
  detail_page = get_detail_page(plane)
  plane["images"] = get_gallery_photos(detail_page)
  plane["specs"] = get_plane_specs(detail_page)

  # The plane's own thumbnail is fetched exactly once per plane, in its own
  # thread, rather than once per gallery image: several threads writing the
  # same file at once is wasteful and racy, and a plane without gallery
  # images would otherwise never get its thumbnail.
  thread_list = [Thread.new { download_plane_image(plane["thumbnail"]) }]

  # Array() tolerates a nil gallery (the scrape of that page failed).
  Array(plane["images"]).each do |image|
    thread_list << Thread.new do
      download_plane_image(image["src"])
      download_plane_image(image["thumbnail"]) if image["thumbnail"]
    end
  end

  # Finish this plane's downloads before moving on to the next plane.
  thread_list.each(&:join)
end
write_json_file planes