#!/bin/bash
url=http://redefininggod.com
webarchive=https://web.archive.org
wget="wget -e robots=off -nv"
tab="$(printf '\t')"
additional_url=url.list
# Construct listing.txt from url.list
# The list of archived pages, including some wildcard urls.
# Each line contains fields separated by tabs:
# - the last capture date (opaque format; if it changed, the last year index file
#   will be redownloaded)
# - the first capture year (hint for which is the oldest index to query)
# - the last capture year (hint for which is the latest index to query)
# - the url, starting with "/web/YYYYMMDDHHMMSS/" or "/web/*" (only)
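# Example listing.txt line (hypothetical values; the separator is a real tab,
# written as <TAB> here for readability):
#   Dec 31, 2014<TAB>1998<TAB>2014<TAB>/web/*/http://redefininggod.com/page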
: >listing.txt
# Add url.list to listing.txt
while read url; do
  if [[ -z "$url" ]]; then continue; fi
  if [[ $url != ${url#http*/web.archive.org} ]]; then
    url="${url#http*/web.archive.org}"
  elif [[ $url != ${url%/\*} ]]; then
    mkdir -p "$(dirname "./web/*/$url")"
    $wget "$webarchive/web/*/$url" -O "./web/*/$url.html"
    # <listing.html fgrep 'href="/web/' | cut -d'"' -f2 >listing.txt
    <"./web/*/$url.html" sed -r -e '
      /<table id="resultsUrl">/,/<\/table>/ {
        /a href/ {
          s/.*href="(.*)".*/\1/;
          h
        };
        /dateFrom/ {
          s/.*([0-9]{4})<\/td>.*/\1/;
          x;
          H
        };
        /dateTo/ {
          s/.*>(.*)([0-9]{4})<\/td>.*/\1\2\n\2/;
          x;
          H
        };
        /<\/tr>/ {
          x;
          s/(.*)\n(.*)\n(.*)\n(.*)/\1\t\3\t\2\t\4/;
          p
        }
      };
      d' >"./web/*/$url.txt"
    cat "./web/*/$url.txt" >>listing.txt
    continue
  else
    url="/web/*/$url"
  fi
  printf "%s\t%s\t%s\t%s\n" "$(date)" 1996 2014 "$url" >>listing.txt
done <"$additional_url"
# Construct listing2.txt
# Remove the wildcard urls and fetch all the versions from the index.
# Lines only contain the URL, starting with "/web/YYYYMMDDHHMMSS/" (only).
# It may contain duplicates.
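# Example listing2.txt lines (hypothetical captures):
#   /web/20090815093012/http://redefininggod.com/page
#   /web/20121104221540/http://redefininggod.com/page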
: >listing2.txt
while read line; do
  if [[ -z "$line" ]]; then continue; fi
  #printf "%s\n" "$line"
  oldifs="$IFS"
  IFS="$tab" elems=($line)
  IFS="$oldifs"
  lastcap="${elems[0]}"
  firstyear="${elems[1]}"
  lastyear="${elems[2]}"
  mainurl="${elems[3]}"
  #echo "Main URL: $firstyear->$lastyear $mainurl"
  if [[ $mainurl =~ '/web/*/' ]]; then
    listing="./$mainurl.txt"
    mkdir -p "$(dirname "$listing")"
    : >"$listing"
    oldlastcap="$(cat "./$mainurl.lastcap.txt" 2>/dev/null)"
    oldlastyear="$(cat "./$mainurl.lastyear.txt" 2>/dev/null)"
    : ${oldlastyear:=$lastyear}
    for y in $(seq $firstyear $lastyear); do
      u="/web/${y}0101000000*/${mainurl#/web/*/}"
      mkdir -p "$(dirname "./$u.html")"
      if ! [[ -s "./$u.html" ]] || ([[ $y -ge $oldlastyear ]] && [[ $lastcap != $oldlastcap ]]); then
        $wget "$webarchive$u" -O "./$u.html"
      fi
      #<"./$u.html" egrep 'href="/web/[0-9]+\*' | sed -r 's/.*href="([^"]*)".*/\1/' >"$d/$f.txt"
      <"./$u.html" egrep 'href="/web/[0-9]*/' | sed -r 's/.*href="([^"]*)".*/\1/' >>"$listing"
    done
    printf %s "$lastcap" >"./$mainurl.lastcap.txt"
    printf %s "$lastyear" >"./$mainurl.lastyear.txt"
    sort <"$listing" | uniq >>listing2.txt
  else
    echo "$mainurl" >>listing2.txt
  fi
done <listing.txt
# Construct listing3.txt
# Sort, uniq, and use the unmodified page by appending id_ to the timestamp.
# URLs must start with "/web/YYYYMMDDHHMMSSid_/" only.
# This is the list of files that need to be downloaded (if not already present).
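# e.g. (hypothetical capture):
#   /web/20090815093012/http://redefininggod.com/page -> /web/20090815093012id_/http://redefininggod.com/page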
<listing2.txt sort | uniq | sed -r 's:^/web/([0-9]*)/:/web/\1id_/:' >listing3.txt
# Download listing3
while read url; do
  if [[ $url != ${url%/} ]]; then
    f="./$url/.index"
  else
    f="./$url"
  fi
  mkdir -p "$(dirname "$f")"
  if ! [[ -s "$f" ]]; then
    $wget "$webarchive$url" -O "$f"
  fi
done <listing3.txt
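A minimal sketch of how the script might be driven, assuming it is saved as download.sh (the name used in the comments below) next to a url.list file; both example entries are hypothetical:

# url.list is the file named in additional_url above: one URL per line,
# with an optional trailing /* to request the wildcard/index expansion
cat > url.list <<'EOF'
http://redefininggod.com/*
http://redefininggod.com/p/some-page.html
EOF
bash download.sh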
Didn't manage to get it to work; ended up coding a small gem in Ruby: https://github.com/hartator/wayback-machine-downloader
./download.sh: line 62: url.list: No such file or directory
For those having problems like "./download.sh: line 62: url.list: No such file or directory" go to lines 21 and 62 and comment them out by placing a # character in front of them. Make sure you edit line 3 to point to your domain before you run the command.
However, I think this only manages to pull the indexes (calendars), not the actual archived pages.
you can do all this in just a few lines by using the api:
#!/bin/bash
url= #your url
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&output=json&fl=original,timestamp" -O out.json
sed -Eni '2,$s%^\["([^"]*)","([^"]*)"](,|])$%wget "https://web.archive.org/web/\2id_/\1" -O \2.html%gmp' out.json
cat out.json | sh

if you don't care about the filenames being dumb, wget will reuse the same connection for all the urls in a list, making it faster:
#!/bin/bash
url= #your url
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&output=json&fl=original,timestamp" -O out.json
sed -Eni '2,$s%^\["([^"]*)","([^"]*)"](,|])$%https://web.archive.org/web/\2id_/\1%gmp' out.json
wget -i out.json

you can even filter the results and such, see the docs
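For example, a sketch of doing the filtering in the CDX query itself (parameter names per the CDX server API; the statuscode and mimetype filters are just one possible choice):

#!/bin/bash
url= #your url
# keep only captures that returned HTTP 200 and are PDF files
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&output=json&fl=original,timestamp&filter=statuscode:200&filter=mimetype:application/pdf" -O out.json
sed -Eni '2,$s%^\["([^"]*)","([^"]*)"](,|])$%https://web.archive.org/web/\2id_/\1%gmp' out.json
wget -i out.json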
@rokke-git THANK YOU! your scripts have been life savers. I wanted to archive a ton of PDF files to my computer. I was trying to use the API before, but I did not realize until I read your scripts that the asterisk * was necessary. I will attach my modified scripts below. Also, I did not experience the 'filenames being dumb' thing. Maybe it was just on your end.
#!/bin/bash
# Download files from an archived website from http://web.archive.org
# Will only use 1 connection.
url="screentalk.org"
# API docs
# https://archive.org/developers/wayback-cdx-server.html
# only grab PDF files
filter="&filter=mimetype:application/pdf"
#
# the asterisk '*' is very important!!!!
# otherwise only the 'base' url will be returned and you will
# not find any files!
wget "http://web.archive.org/cdx/search/cdx?url=${url}*${filter}&output=json&fl=original,timestamp" -O out.json
sed -Eni '2,$s%^\["([^"]*)","([^"]*)"](,|])$%https://web.archive.org/web/\2id_/\1%gmp' out.json
wget -i out.json
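As a side note on the asterisk mentioned above: a trailing '*' is shorthand for a prefix match, and the CDX server also accepts an explicit matchType parameter, so a roughly equivalent request (a sketch reusing the url and filter variables from the script above) would be:

wget "http://web.archive.org/cdx/search/cdx?url=${url}&matchType=prefix${filter}&output=json&fl=original,timestamp" -O out.json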
#!/bin/bash
# Download files from an archived website from http://web.archive.org
# Will not reuse the previous connection, may get blocked!
# If you would like to reuse a single connection, you must tell
# wget to read from a single file.
url="screentalk.org"
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&output=json&fl=original,timestamp,mimetype" -O out.json
grep 'application/pdf' < out.json | sed "s/.\{0,20\}$//; /^$/d;s/$/],/" > out2.json
sed -En '1,$s%^\["([^"]*)","([^"]*)"](,|])$%wget "https://web.archive.org/web/\2id_/\1" -O \2.pdf%gmp' <out2.json >out3.json
cat out3.json | sh
@TrevCan wow, totally forgot about this. glancing at this again all three lines can be simplified; here's a oneliner which doesn't make some random out.json:
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&fl=timestamp,original" -O- | \
sed 's%^%https://web.archive.org/web/%;s% %id_/%' | wget -i-

also, you can do your filtering inside the same sed call in various ways, for example:
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&fl=timestamp,original,mimetype" -O- | \
sed -n 's%^%https://web.archive.org/web/%;s% %id_/%;s% application/pdf%%p' | wget -i-

you mean this is for d-ding some pdf books, only available for borrowing? or what?
you mean this is for d-ding some pdf books, only available for borrowing? or what?
this is for downloading webpages saved by the internet archive. some websites have pages which are just a pdf; the second one-liner in my most recent comment filters all the pages the internet archive has saved under the url down to just the pages which are pdfs
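A related sketch: to see which kinds of pages the archive holds for a site before filtering, the fl parameter can be pointed at mimetype (url is a placeholder, as in the scripts above):

# count the captures per mimetype for a site
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&fl=mimetype" -O- | sort | uniq -c | sort -rn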
C:\Users\COMPUTER\Desktop\566\download.sh: syntax error near unexpected token `elems=($'
C:\Users\COMPUTER\Desktop\566\download.sh: line 77: `IFS="$tab" elems=($line)'