
@mildred
Created October 20, 2014 10:03

download.sh
    #!/bin/bash

    url=http://redefininggod.com
    webarchive=https://web.archive.org
    wget="wget -e robots=off -nv"
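    # -e robots=off makes wget ignore robots.txt; -nv keeps the output terse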
    tab="$(printf '\t')"
    additional_url=url.list

    # Construct listing.txt from url.list
    # This is the list of archived pages, possibly including wildcard URLs.
    # Each line contains four fields separated by tabs:
    # - the last capture date (opaque format; if it has changed, the index file
    #   for the last year will be re-downloaded)
    # - the first capture year (a hint for the oldest index to query)
    # - the last capture year (a hint for the latest index to query)
    # - the URL, starting with "/web/YYYYMMDDHHMMSS/" or "/web/*/" (only)
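    # A hypothetical example line (the fields are separated by real tab characters):
    #   20 Oct 2014<TAB>1996<TAB>2014<TAB>/web/*/http://redefininggod.com/page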

    : >listing.txt

    # Add url.list to listing.txt
    while read url; do

    if [[ -z "$url" ]]; then continue; fi

    if [[ $url != ${url#http*/web.archive.org} ]]; then
    url="${url#http*/web.archive.org}"
    elif [[ $url != ${url%/\*} ]]; then
    mkdir -p "$(dirname "./web/*/$url")"
    $wget "$webarchive/web/*/$url" -O "./web/*/$url.html"
    # <listing.html fgrep 'href="/web/' | cut -d'"' -f2 >listing.txt
    <"./web/*/$url.html" sed -r -e '
    /<table id="resultsUrl">/,/<\/table>/ {
    /a href/ {
    s/.*href="(.*)".*/\1/;
    h
    };
    /dateFrom/ {
    s/.*([0-9]{4})<\/td>.*/\1/;
    x;
    H
    };
    /dateTo/ {
    s/.*>(.*)([0-9]{4})<\/td>.*/\1\2\n\2/;
    x;
    H
    };
    /<\/tr>/ {
    x;
    s/(.*)\n(.*)\n(.*)\n(.*)/\1\t\3\t\2\t\4/;
    p
    }
    };
    d' >"./web/*/$url.txt"
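    # The sed program above walks the "resultsUrl" table of the wildcard search
    # page: for each row it keeps the capture URL (href), the dateFrom year and
    # the dateTo date/year, and emits one tab-separated line in the
    # "lastcap<TAB>firstyear<TAB>lastyear<TAB>url" format described above.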
    cat "./web/*/$url.txt" >>listing.txt
    continue
    else
    url="/web/*/$url"
    fi

    printf "%s\t%s\t%s\t%s\n" "$(date)" 1996 2014 "$url" >>listing.txt

    done <"$additional_url"

    # Construct listing2.txt
    # Expand the wildcard URLs by fetching every capture listed in the year indexes.
    # Each line contains only a URL starting with "/web/YYYYMMDDHHMMSS/".
    # The file may contain duplicates.
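    # A hypothetical example line: /web/20141020100300/http://redefininggod.com/page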
    : >listing2.txt

    while read line; do

    if [[ -z "$line" ]]; then continue; fi

    #printf "%s\n" "$line"

    oldifs="$IFS"
    IFS="$tab" elems=($line)
    IFS="$oldifs"
    lastcap="${elems[0]}"
    firstyear="${elems[1]}"
    lastyear="${elems[2]}"
    mainurl="${elems[3]}"
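    # Splitting on IFS=tab yields, for example (hypothetical values):
    #   lastcap="20 Oct 2014"  firstyear=1996  lastyear=2014
    #   mainurl="/web/*/http://redefininggod.com/page"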

    #echo "Main URL: $firstyear->$lastyear $mainurl"

    if [[ $mainurl =~ '/web/*/' ]]; then
    listing="./$mainurl.txt"
    mkdir -p "$(dirname "$listing")"
    : >"$listing"
    oldlastcap="$(cat "./$mainurl.lastcap.txt" 2>/dev/null)"
    oldlastyear="$(cat "./$mainurl.lastyear.txt" 2>/dev/null)"
    : ${oldlastyear:=$lastyear}
    for y in $(seq $firstyear $lastyear); do
    u="/web/${y}0101000000*/${mainurl#/web/*/}"
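    # A URL of the form "/web/YYYY0101000000*/PAGE" asks the Wayback Machine for
    # its index of captures of PAGE around that timestamp; one such index page is
    # fetched per year, and the capture links are extracted below.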
    mkdir -p "$(dirname "./$u.html")"
    if ! [[ -s "./$u.html" ]] || ([[ $y -ge $oldlastyear ]] && [[ $lastcap != $oldlastcap ]]) ; then
    $wget "$webarchive$u" -O "./$u.html"
    fi
    #<"./$u.html" egrep 'href="/web/[0-9]+\*' | sed -r 's/.*href="([^"]*)".*/\1/' >"$d/$f.txt"
    <"./$u.html" egrep 'href="/web/[0-9]*/' | sed -r 's/.*href="([^"]*)".*/\1/' >>"$listing"
    done
    printf %s "$lastcap" >"./$mainurl.lastcap.txt"
    printf %s "$lastyear" >"./$mainurl.lastyear.txt"
    <"$listing" | sort | uniq >>listing2.txt
    else
    echo "$mainurl" >>listing2.txt
    fi

    done <listing.txt

    # Construct listing3.txt
    # Sort, deduplicate, and append "id_" to the timestamp to request the
    # unmodified page.
    # Each URL must start with "/web/YYYYMMDDHHMMSSid_/" only.
    # This is the list of files that need to be downloaded (if not already present).
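    # For example, "/web/20141020100300/http://example.com/" becomes
    # "/web/20141020100300id_/http://example.com/"; the "id_" flag makes the
    # Wayback Machine serve the capture as-is, without its rewriting and toolbar.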

    <listing2.txt sort | uniq | sed -r 's:^/web/([0-9]*)/:/web/\1id_/:' >listing3.txt

    # Download listing3

    while read url; do

    if [[ $url != ${url%/} ]]; then
    f="./$url/.index"
    else
    f="./$url"
    fi
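    # URLs ending in "/" map to directories on disk, so their content is stored
    # in a ".index" file inside the directory to avoid a file/directory name clash.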

    mkdir -p "$(dirname "$f")"
    if ! [[ -s "$f" ]]; then
    $wget "$webarchive$url" -O "./$f"
    fi

    done <listing3.txt
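
A minimal usage sketch (assuming the script is saved as download.sh and run from
an empty working directory; the URL below is only an example):

    printf '%s\n' 'http://redefininggod.com/' >url.list
    bash download.sh

Since the script skips non-empty files it has already fetched, it can be
re-run to resume an interrupted download.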