#!/bin/bash

url=http://redefininggod.com        # unused default; URLs are read from $additional_url below
webarchive=https://web.archive.org
wget="wget -e robots=off -nv"
tab="$(printf '\t')"
additional_url=url.list

# Construct listing.txt from url.list
# This is the list of archived pages, including some wildcard URLs.
# Each line contains the following tab-separated fields:
# - the last capture date (opaque format; if it changed, the last year's
#   index file is redownloaded)
# - the first capture year (hint for the oldest index to query)
# - the last capture year (hint for the latest index to query)
# - the URL, starting with "/web/YYYYMMDDHHMMSS/" or "/web/*/" (only)

: >listing.txt

# Add url.list to listing.txt
while read url; do
  if [[ -z "$url" ]]; then continue; fi
  if [[ $url != ${url#http*/web.archive.org} ]]; then
    # Already a web.archive.org URL: keep only the /web/... part
    url="${url#http*/web.archive.org}"
  elif [[ $url != ${url%/\*} ]]; then
    # Wildcard URL: fetch the search results page and extract the capture
    # dates and URLs from the results table
    mkdir -p "$(dirname "./web/*/$url")"
    $wget "$webarchive/web/*/$url" -O "./web/*/$url.html"
    #<listing.html fgrep 'href="/web/' | cut -d'"' -f2 >listing.txt
    <"./web/*/$url.html" sed -r -e '
      /<table id="resultsUrl">/,/<\/table>/ {
        /a href/   { s/.*href="(.*)".*/\1/; h };
        /dateFrom/ { s/.*([0-9]{4})<\/td>.*/\1/; x; H };
        /dateTo/   { s/.*>(.*)([0-9]{4})<\/td>.*/\1\2\n\2/; x; H };
        /<\/tr>/   { x; s/(.*)\n(.*)\n(.*)\n(.*)/\1\t\3\t\2\t\4/; p }
      }; d' >"./web/*/$url.txt"
    cat "./web/*/$url.txt" >>listing.txt
    continue
  else
    # Plain URL: turn it into a wildcard query over all captures
    url="/web/*/$url"
  fi
  printf "%s\t%s\t%s\t%s\n" "$(date)" 1996 2014 "$url" >>listing.txt
done <"$additional_url"

# Construct listing2.txt
# Resolve the wildcard URLs by fetching the yearly index pages.
# Each line contains only a URL starting with "/web/YYYYMMDDHHMMSS/".
# The list may contain duplicates.

: >listing2.txt

while read line; do
  if [[ -z "$line" ]]; then continue; fi
  #printf "%s\n" "$line"

  # Split the tab-separated fields into an array
  oldifs="$IFS"
  IFS="$tab"
  elems=($line)
  IFS="$oldifs"
  lastcap="${elems[0]}"
  firstyear="${elems[1]}"
  lastyear="${elems[2]}"
  mainurl="${elems[3]}"
  #echo "Main URL: $firstyear->$lastyear $mainurl"

  if [[ $mainurl =~ '/web/*/' ]]; then
    # Wildcard URL: query one index page per year and collect the
    # individual capture URLs it links to
    listing="./$mainurl.txt"
    mkdir -p "$(dirname "$listing")"
    : >"$listing"
    oldlastcap="$(cat "./$mainurl.lastcap.txt" 2>/dev/null)"
    oldlastyear="$(cat "./$mainurl.lastyear.txt" 2>/dev/null)"
    : ${oldlastyear:=$lastyear}
    for y in $(seq $firstyear $lastyear); do
      u="/web/${y}0101000000*/${mainurl#/web/*/}"
      mkdir -p "$(dirname "./$u.html")"
      # Redownload the index only if it is missing, or if it is the latest
      # one and the last capture date changed
      if ! [[ -s "./$u.html" ]] || ([[ $y -ge $oldlastyear ]] && [[ $lastcap != $oldlastcap ]]); then
        $wget "$webarchive$u" -O "./$u.html"
      fi
      #<"./$u.html" egrep 'href="/web/[0-9]+\*' | sed -r 's/.*href="([^"]*)".*/\1/' >"$d/$f.txt"
      <"./$u.html" egrep 'href="/web/[0-9]*/' | sed -r 's/.*href="([^"]*)".*/\1/' >>"$listing"
    done
    printf %s "$lastcap" >"./$mainurl.lastcap.txt"
    printf %s "$lastyear" >"./$mainurl.lastyear.txt"
    <"$listing" sort | uniq >>listing2.txt
  else
    echo "$mainurl" >>listing2.txt
  fi
done <listing.txt

# Construct listing3.txt
# Sort, deduplicate, and request the unmodified pages by appending "id_" to
# the timestamp, so every URL starts with "/web/YYYYMMDDHHMMSSid_/".
# This is the list of files that need to be downloaded (if not already present).
<listing2.txt sort | uniq | sed -r 's:^/web/([0-9]*)/:/web/\1id_/:' >listing3.txt

# Download every URL in listing3.txt
while read url; do
  if [[ $url != ${url%/} ]]; then
    # URL ends with a slash: store it as a directory index
    f="./$url/.index"
  else
    f="./$url"
  fi
  mkdir -p "$(dirname "$f")"
  if ! [[ -s "$f" ]]; then
    $wget "$webarchive$url" -O "$f"
  fi
done <listing3.txt
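
# Usage sketch (the file name archive-dl.sh below is hypothetical): run the
# script from an empty working directory with url.list next to it. Plain URLs,
# wildcard URLs ending in "/*", and full web.archive.org capture URLs are all
# accepted, for example:
#
#   $ cat url.list
#   http://redefininggod.com/*
#   https://web.archive.org/web/20141020100000/http://redefininggod.com/
#   $ bash archive-dl.sh
#
# listing.txt, listing2.txt and listing3.txt are rebuilt in the current
# directory, and every capture is mirrored under ./web/.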