#!/usr/bin/ruby
#
# Download lecture videos of courses from Coursera (http://www.coursera.org).
#
# Install requirements:
#
#   $ gem install curb trollop nokogiri capybara ruby-progressbar
#
# Example -- Download all video lectures of courses "Calculus: Single Variable"
# and "Introduction to Astronomy":
#
#   $ ruby coursera-download-videos.rb \
#       --email=user@server.org --password=123456 \
#       "Calculus: Single Variable" "Introduction to Astronomy"
#
# Contact: tokland@gmail.com
require 'rubygems'
require 'fileutils'
require 'logger'
require 'set'
require 'progressbar'
require 'capybara'
require 'nokogiri'
require 'trollop'
require 'curl'

### Extensions

class Hash
  # Keep only given keys in hash.
  #
  # Returns the result of #select restricted to entries whose key is one of
  # +keys+; the receiver is not modified.
  def slice(*keys)
    keys_set = keys.to_set
    select { |k, _v| keys_set.include?(k) }
  end

  # Reverse update hash with a given default hash (performs key checking).
  #
  # The receiver is modified in place: its contents are replaced by
  # +default_hash+ merged with the receiver's own entries, so explicitly
  # given values win over defaults.
  #
  # Raises ArgumentError when the receiver has keys missing from
  # +default_hash+.
  def defaults(default_hash)
    unknown_options = keys - default_hash.keys
    unless unknown_options.empty?
      raise ArgumentError, "unknown key(s): #{unknown_options.join(', ')}"
    end
    replace(default_hash.merge(self))
  end
end

# Null object returned by Object#maybe for nil receivers: almost every
# inherited instance method is undefined so that any call falls through to
# #method_missing, which answers nil.
class MaybeWrapper
  instance_methods.each { |m| undef_method(m) unless m == :object_id || m =~ /^__/ }

  def method_missing(*args, &block)
    nil
  end
end

class Object
  # Nil-safe access helper.
  #
  # With a block: returns nil for a nil receiver, otherwise the value of the
  # block called with the receiver.
  # Without a block: returns a MaybeWrapper for a nil receiver (so the next
  # chained call yields nil), otherwise the receiver itself.
  def maybe(&block)
    if block_given?
      nil? ? nil : yield(self)
    else
      nil? ? MaybeWrapper.new : self
    end
  end
end

module Curl
  # Download +url+ into the file +destination+ (opened in binary write mode).
  #
  # Options:
  #   :progressbar - show a console progress bar while downloading
  #                  (default: false).
  #   :cookies     - array of "name=value" strings sent in the Cookie header
  #                  (default: []).
  #
  # Raises ArgumentError (from Hash#defaults) on unknown option keys. Note that
  # the +options+ hash passed by the caller is filled in with the defaults.
  def self.download_to_file(url, destination, options = {})
    options.defaults(:progressbar => false, :cookies => [])
    cookies = Array(options[:cookies])

    # File.open rather than Kernel#open: the latter would treat a destination
    # starting with "|" as a command to execute.
    File.open(destination, "wb") do |fd|
      curl = Curl::Easy.new(url).tap do |c|
        c.follow_location = true
        c.enable_cookies = true
        # Only send the header when there is something to send.
        c.headers["Cookie"] = cookies.join("; ") unless cookies.empty?
        c.on_body { |data| fd.write(data) }
      end

      if options[:progressbar]
        title = File.basename(destination, File.extname(destination))
        state = {:pbar => nil, :dl_total => nil}

        curl.on_progress do |dl_total, dl_now, _ul_total, _ul_now|
          if dl_total > 0
            # (Re)create the bar the first time a total is known and whenever
            # the reported total grows (presumably after a redirect is
            # followed -- TODO confirm).
            if !state[:pbar] || state[:dl_total].nil? || dl_total > state[:dl_total]
              state[:dl_total] = dl_total
              state[:pbar] = ProgressBar.new(title, dl_total)
              state[:pbar].format_arguments =
                [:title, :percentage, :bar, :stat_for_file_transfer]
            end
            state[:pbar].set(dl_now)
          end
          true
        end

        curl.perform
        state[:pbar].finish if state[:pbar]
      else
        curl.perform
      end
    end
  end
end

# Thin wrapper that configures Capybara globally and exposes its DSL
# (visit, find, fill_in, ...) as instance methods.
class CapybaraBrowser
  include Capybara::DSL

  # Options:
  #   :driver            - Capybara driver name (default: :selenium).
  #   :default_wait_time - value assigned to Capybara.default_wait_time
  #                        (default: 60).
  def initialize(options = {})
    options.defaults(:driver => :selenium, :default_wait_time => 60)
    Capybara.current_driver = options.fetch(:driver)
    Capybara.default_wait_time = options.fetch(:default_wait_time)
    Capybara.run_server = false
  end

  # Return the cookies of the current session as an array of "name=value"
  # strings.
  #
  # Raises ArgumentError for drivers other than capybara-webkit and selenium.
  def cookies
    driver = Capybara.current_session.driver
    case driver.class.name
    when "Capybara::Webkit::Driver"
      # Keep only the part before the first ";" of each cookie string.
      driver.browser.get_cookies.map { |s| s.split(";").first }
    when "Capybara::Selenium::Driver"
      driver.browser.manage.all_cookies.map { |c| [c[:name], c[:value]].join("=") }
    else
      raise ArgumentError, "Unsupported driver: #{driver.class.name}"
    end
  end
end

### Application

# Drives a browser session on coursera.org and downloads lecture videos.
class Coursera
  attr_reader :browser, :logger

  SessionError = Class.new(StandardError)
  ParserError = Class.new(StandardError)

  # +options+ is accepted for interface compatibility but currently unused.
  def initialize(options = {})
    @browser = CapybaraBrowser.new
    @logger = Logger.new(STDERR)
    # Log bare messages, one per line.
    @logger.formatter = proc { |_severity, _datetime, _progname, msg| "#{msg}\n" }
  end

  # Sign in through the web form, then look up the account-name element in
  # the page header (the password is masked in the log).
  def login(email, password)
    logger.debug("Login: email='#{email}' password='#{'*' * password.size}'")
    browser.visit("https://www.coursera.org/account/signin")
    browser.fill_in("signin-email", :with => email)
    browser.fill_in("signin-password", :with => password)
    browser.click_button("Sign In")
    browser.find(".coursera-header-account-name")
  end

  # Download every lecture video of the course whose link text is
  # +course_name+.
  #
  # Options:
  #   :destination_directory - base directory for the downloads (default: nil,
  #                            i.e. paths start at the course name).
  #
  # Files are stored as "<course>/<NN - section>/<NN - lecture>.mp4".
  # Returns the array of file paths. Raises SessionError when not logged in.
  def download_videos(course_name, options = {})
    options.defaults(:destination_directory => nil)
    browser.visit("https://www.coursera.org/")

    # NOTE(review): Capybara's find appears to raise when the element is
    # missing rather than return a falsy value, so this SessionError may be
    # unreachable -- confirm against the Capybara version in use.
    raise SessionError, "Not logged in" unless browser.find(".coursera-header-account-name")

    browser.click_link(course_name)
    lectures_link_xpath =
      '//*[@class="course-navbar-item"]/a[contains(@href, "lecture/index")]'
    browser.find(:xpath, lectures_link_xpath).click
    browser.find("#spark")

    get_videos_from_course_page(browser.html).map do |info|
      course, section, index, lecture, url = info
      directory = File.join([options[:destination_directory], course, section].compact)
      path = File.join(directory, "%02d - %s.mp4" % [index + 1, lecture])
      logger.debug("Download video: #{path}")
      safe_download(url, path, browser.cookies)
      path
    end
  end

  private

  # Parse the lecture index page.
  #
  # Returns an array of [course_name, section_name, lecture_index,
  # lecture_title, url] entries. Raises ParserError when an expected element
  # is missing.
  def get_videos_from_course_page(html)
    doc = Nokogiri::HTML(html)
    course_name = doc.at_css("h1").maybe.text ||
      raise(ParserError, "Cannot find course name")
    logger.debug("Course '#{course_name}'")

    # Some courses have the videos in reverse order, detect this case
    # with a simple heuristics (check if the first integers in the section
    # titles are in descending order).
    lis = doc.css(".course-item-list-header")
    ns = lis.map { |li| li.text.match(/\d+/).maybe[0].maybe.to_i }.compact
    is_reversed = !ns.empty? && ns.each_cons(2).all? { |x, y| x > y }
    ordered_lis = is_reversed ? lis.reverse : lis

    ordered_lis.flat_map.with_index do |section, section_index|
      h3 = section.at_css("h3") || raise(ParserError, "Cannot find h3")
      section_title = h3.text.gsub(/[[:space:]]+/, ' ').strip
      section_name = "%02d - %s" % [section_index + 1, section_title]

      # NOTE(review): Node#next may return a whitespace text node depending on
      # the markup; next_element might be what is wanted -- confirm.
      lecture_video_list = section.next ||
        raise(ParserError, "Cannot find lecture video list")

      lecture_video_list.css("li").map.with_index do |lecture, index|
        link = lecture.at_css("a.lecture-link") ||
          raise(ParserError, "Cannot find lecture link")
        # "/" would be interpreted as a directory separator in the file name.
        lecture_title = link.text.gsub("/", "-").strip
        url = lecture.at_css("a[title='Video (MP4)']").maybe["href"] ||
          raise(ParserError, "Cannot find video link")
        [course_name, section_name, index, lecture_title, url]
      end
    end
  end

  # Download +url+ to +path+ unless the file already exists.
  #
  # The data is written to "<path>.partial" first and moved into place only
  # after the transfer finished, so an interrupted download never leaves a
  # truncated file at +path+. The partial file is always removed.
  def safe_download(url, path, cookies)
    return if File.exist?(path)

    FileUtils.mkpath(File.dirname(path))
    temp_path = path + ".partial"
    begin
      Curl.download_to_file(url, temp_path, :progressbar => true, :cookies => cookies)
      FileUtils.mv(temp_path, path)
    ensure
      # No-op after a successful move; cleans up after a failed transfer.
      FileUtils.safe_unlink(temp_path)
    end
  end
end

if __FILE__ == $0
  options = Trollop.options do
    banner "Usage: download-coursera-videos [OPTIONS] COURSENAME [COURSENAME2 ...]"
    opt :destination_directory, "Directory destination", :type => :string
    opt :email, "Email", :type => :string, :required => true
    opt :password, "Password", :type => :string, :required => true
  end

  if ARGV.empty?
    Trollop.die("Specify at least one course to download")
  else
    coursera = Coursera.new
    coursera.login(options[:email], options[:password])
    # Pass on only the keys download_videos knows about; Hash#defaults
    # rejects anything else.
    download_options = options.slice(:destination_directory)
    ARGV.each do |course_name|
      coursera.download_videos(course_name, download_options)
    end
  end
end