
@mildred
Created October 20, 2014 10:03

download.sh
    #!/bin/bash

    url=http://redefininggod.com
    webarchive=https://web.archive.org
    wget="wget -e robots=off -nv"
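    # -e robots=off makes wget ignore robots.txt; -nv keeps the output terse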
    tab="$(printf '\t')"
    additional_url=url.list

    # Construct listing.txt from url.list
    # This is the list of archived pages, possibly including wildcard URLs.
    # Each line contains four fields separated by tabs:
    # - the last capture date (opaque format; if it has changed, the index file
    #   for the last year will be re-downloaded)
    # - the first capture year (a hint for the oldest index to query)
    # - the last capture year (a hint for the latest index to query)
    # - the URL, starting with "/web/YYYYMMDDHHMMSS/" or "/web/*/" (only)
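    # A hypothetical example line (the fields are separated by real tab characters):
    #   20 Oct 2014<TAB>1996<TAB>2014<TAB>/web/*/http://redefininggod.com/page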

    : >listing.txt

    # Add url.list to listing.txt
    while read url; do

    if [[ -z "$url" ]]; then continue; fi

    if [[ $url != ${url#http*/web.archive.org} ]]; then
    url="${url#http*/web.archive.org}"
    elif [[ $url != ${url%/\*} ]]; then
    mkdir -p "$(dirname "./web/*/$url")"
    $wget "$webarchive/web/*/$url" -O "./web/*/$url.html"
    # <listing.html fgrep 'href="/web/' | cut -d'"' -f2 >listing.txt
    <"./web/*/$url.html" sed -r -e '
    /<table id="resultsUrl">/,/<\/table>/ {
    /a href/ {
    s/.*href="(.*)".*/\1/;
    h
    };
    /dateFrom/ {
    s/.*([0-9]{4})<\/td>.*/\1/;
    x;
    H
    };
    /dateTo/ {
    s/.*>(.*)([0-9]{4})<\/td>.*/\1\2\n\2/;
    x;
    H
    };
    /<\/tr>/ {
    x;
    s/(.*)\n(.*)\n(.*)\n(.*)/\1\t\3\t\2\t\4/;
    p
    }
    };
    d' >"./web/*/$url.txt"
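    # The sed program above walks the "resultsUrl" table of the wildcard search
    # page: for each row it keeps the capture URL (href), the dateFrom year and
    # the dateTo date/year, and emits one tab-separated line in the
    # "lastcap<TAB>firstyear<TAB>lastyear<TAB>url" format described above.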
    cat "./web/*/$url.txt" >>listing.txt
    continue
    else
    url="/web/*/$url"
    fi

    printf "%s\t%s\t%s\t%s\n" "$(date)" 1996 2014 "$url" >>listing.txt

    done <"$additional_url"

    # Construct listing2.txt
    # Expand the wildcard URLs by fetching every capture listed in the year indexes.
    # Each line contains only a URL starting with "/web/YYYYMMDDHHMMSS/".
    # The file may contain duplicates.
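    # A hypothetical example line: /web/20141020100300/http://redefininggod.com/page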
    : >listing2.txt

    while read line; do

    if [[ -z "$line" ]]; then continue; fi

    #printf "%s\n" "$line"

    oldifs="$IFS"
    IFS="$tab" elems=($line)
    IFS="$oldifs"
    lastcap="${elems[0]}"
    firstyear="${elems[1]}"
    lastyear="${elems[2]}"
    mainurl="${elems[3]}"
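    # Splitting on IFS=tab yields, for example (hypothetical values):
    #   lastcap="20 Oct 2014"  firstyear=1996  lastyear=2014
    #   mainurl="/web/*/http://redefininggod.com/page"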

    #echo "Main URL: $firstyear->$lastyear $mainurl"

    if [[ $mainurl =~ '/web/*/' ]]; then
    listing="./$mainurl.txt"
    mkdir -p "$(dirname "$listing")"
    : >"$listing"
    oldlastcap="$(cat "./$mainurl.lastcap.txt" 2>/dev/null)"
    oldlastyear="$(cat "./$mainurl.lastyear.txt" 2>/dev/null)"
    : ${oldlastyear:=$lastyear}
    for y in $(seq $firstyear $lastyear); do
    u="/web/${y}0101000000*/${mainurl#/web/*/}"
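    # A URL of the form "/web/YYYY0101000000*/PAGE" asks the Wayback Machine for
    # its index of captures of PAGE around that timestamp; one such index page is
    # fetched per year, and the capture links are extracted below.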
    mkdir -p "$(dirname "./$u.html")"
    if ! [[ -s "./$u.html" ]] || ([[ $y -ge $oldlastyear ]] && [[ $lastcap != $oldlastcap ]]) ; then
    $wget "$webarchive$u" -O "./$u.html"
    fi
    #<"./$u.html" egrep 'href="/web/[0-9]+\*' | sed -r 's/.*href="([^"]*)".*/\1/' >"$d/$f.txt"
    <"./$u.html" egrep 'href="/web/[0-9]*/' | sed -r 's/.*href="([^"]*)".*/\1/' >>"$listing"
    done
    printf %s "$lastcap" >"./$mainurl.lastcap.txt"
    printf %s "$lastyear" >"./$mainurl.lastyear.txt"
    <"$listing" | sort | uniq >>listing2.txt
    else
    echo "$mainurl" >>listing2.txt
    fi

    done <listing.txt

    # Construct listing3.txt
    # Sort, deduplicate, and append "id_" to the timestamp to request the
    # unmodified page.
    # Each URL must start with "/web/YYYYMMDDHHMMSSid_/" only.
    # This is the list of files that need to be downloaded (if not already present).
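    # For example, "/web/20141020100300/http://example.com/" becomes
    # "/web/20141020100300id_/http://example.com/"; the "id_" flag makes the
    # Wayback Machine serve the capture as-is, without its rewriting and toolbar.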

    <listing2.txt sort | uniq | sed -r 's:^/web/([0-9]*)/:/web/\1id_/:' >listing3.txt

    # Download listing3

    while read url; do

    if [[ $url != ${url%/} ]]; then
    f="./$url/.index"
    else
    f="./$url"
    fi
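    # URLs ending in "/" map to directories on disk, so their content is stored
    # in a ".index" file inside the directory to avoid a file/directory name clash.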

    mkdir -p "$(dirname "$f")"
    if ! [[ -s "$f" ]]; then
    $wget "$webarchive$url" -O "./$f"
    fi

    done <listing3.txt
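
A minimal usage sketch (assuming the script is saved as download.sh and run from
an empty working directory; the URL below is only an example):

    printf '%s\n' 'http://redefininggod.com/' >url.list
    bash download.sh

Since the script skips non-empty files it has already fetched, it can be
re-run to resume an interrupted download.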