Skip to content

Instantly share code, notes, and snippets.

@mildred
Created October 20, 2014 10:03
Show Gist options
  • Save mildred/7a33bb9c263f025b59e6 to your computer and use it in GitHub Desktop.
Save mildred/7a33bb9c263f025b59e6 to your computer and use it in GitHub Desktop.
Download from archive.org Wayback Machine
#!/bin/bash
# ---------------------------------------------------------------------------
# Settings shared by every stage of the script.
# ---------------------------------------------------------------------------
# Site of interest. The loops further down reuse the name `url` as their
# loop variable, so this value is overwritten once the first loop starts.
url='http://redefininggod.com'
# Base address prepended to every "/web/..." path before it is fetched.
webarchive='https://web.archive.org'
# Download command line. It is stored as one string and expanded unquoted
# (`$wget ...`) so that the shell splits it back into command + options.
wget='wget -e robots=off -nv'
# A single literal TAB character: the field separator of listing.txt.
tab=$'\t'
# Input file listing the URLs to mirror, one per line.
additional_url='url.list'
# Construct listing.txt from url.list
#
# listing.txt is the list of archived pages, possibly including wildcard URLs.
# Each line holds four TAB-separated fields:
#   1. the last capture date (opaque string; when it differs from the one
#      remembered from the previous run, the newest year indexes are fetched
#      again)
#   2. the first capture year (oldest year index to query)
#   3. the last capture year (newest year index to query)
#   4. the URL, starting with "/web/YYYYMMDDHHMMSS/" or "/web/*" (only)
#
# Each non-empty line of url.list is handled in one of three ways:
#   - a full web.archive.org URL has its scheme and host removed;
#   - an entry ending in "/*" (and not a web.archive.org URL) makes the script
#     download the Wayback result table for that prefix and convert every row
#     of the table into one listing.txt line;
#   - anything else becomes "/web/*/ENTRY".

#######################################
# Map a non-index url.list entry to its Wayback Machine path.
# Arguments: $1 - one entry of url.list
# Outputs:   the entry without its "http...//web.archive.org" prefix when it
#            has one, otherwise "/web/*/" followed by the entry
#######################################
wayback_path_for() {
  local entry=$1
  local stripped=${entry#http*/web.archive.org}
  # Both sides are quoted on purpose: an unquoted right-hand side is matched
  # as a glob pattern, so an entry containing "[...]" would compare as
  # "different from itself" and be mistaken for a web.archive.org URL.
  if [[ "$stripped" != "$entry" ]]; then
    printf '%s\n' "$stripped"
  else
    printf '%s\n' "/web/*/$entry"
  fi
}

: >listing.txt
# Newest year index worth querying. A fixed year would silently ignore every
# capture made after it.
current_year=$(date +%Y)
# `read -r` keeps backslashes intact; the `|| [[ -n ... ]]` part also processes
# a final line that lacks a trailing newline.
while read -r url || [[ -n "$url" ]]; do
  [[ -n "$url" ]] || continue
  if [[ "${url#http*/web.archive.org}" == "$url" && "$url" == *'/*' ]]; then
    # Prefix query: fetch the result table and turn it into listing lines.
    index="./web/*/$url"
    mkdir -p "$(dirname "$index")"
    # $wget is deliberately unquoted: it holds the command plus its options.
    # shellcheck disable=SC2086
    if ! $wget "$webarchive/web/*/$url" -O "$index.html"; then
      printf 'warning: could not fetch the capture index for %s\n' "$url" >&2
      continue
    fi
    # <listing.html fgrep 'href="/web/' | cut -d'"' -f2 >listing.txt
    # The sed program only looks inside <table id="resultsUrl">. For each row
    # it accumulates in the hold space the link, the year taken from the
    # "dateFrom" cell, and the text + year taken from the "dateTo" cell; at
    # </tr> it reorders these pieces into the four TAB-separated fields and
    # prints them. Everything else is deleted.
    sed -r -e '
      /<table id="resultsUrl">/,/<\/table>/ {
        /a href/ {
          s/.*href="(.*)".*/\1/;
          h
        };
        /dateFrom/ {
          s/.*([0-9]{4})<\/td>.*/\1/;
          x;
          H
        };
        /dateTo/ {
          s/.*>(.*)([0-9]{4})<\/td>.*/\1\2\n\2/;
          x;
          H
        };
        /<\/tr>/ {
          x;
          s/(.*)\n(.*)\n(.*)\n(.*)/\1\t\3\t\2\t\4/;
          p
        }
      };
      d' <"$index.html" >"$index.txt"
    cat "$index.txt" >>listing.txt
    continue
  fi
  # The current date is stored as the "last capture" field. 1996 is the fixed
  # first year to query (presumably the oldest year the archive can hold
  # captures for -- confirm before changing it).
  printf '%s\t%s\t%s\t%s\n' \
    "$(date)" 1996 "$current_year" "$(wayback_path_for "$url")" >>listing.txt
done <"$additional_url"
# Construct listing2.txt
#
# Every wildcard URL of listing.txt is replaced by the individual captures
# found in its per-year index pages; other URLs are copied unchanged.
# Each line only contains a URL starting with "/web/YYYYMMDDHHMMSS/".
# The file may contain duplicates.

#######################################
# Expand one wildcard entry of listing.txt into its individual captures.
# Globals:   wget, webarchive (read)
# Arguments: $1 - last capture date (opaque string)
#            $2 - first year index to query
#            $3 - last year index to query
#            $4 - wildcard URL containing "/web/*/"
# Outputs:   appends the sorted, de-duplicated capture URLs to listing2.txt;
#            writes "./$4.txt", "./$4.lastcap.txt", "./$4.lastyear.txt" and
#            one cached "./web/<year>0101000000*/....html" index per year
# Returns:   1 when the year fields are not plain numbers, 0 otherwise
#######################################
expand_wildcard_entry() {
  local lastcap=$1 firstyear=$2 lastyear=$3 mainurl=$4
  local listing="./$mainurl.txt"
  local oldlastcap oldlastyear y u

  if ! [[ "$firstyear" =~ ^[1-9][0-9]*$ && "$lastyear" =~ ^[1-9][0-9]*$ ]]; then
    printf 'warning: bad year range "%s".."%s" for %s\n' \
      "$firstyear" "$lastyear" "$mainurl" >&2
    return 1
  fi

  mkdir -p "$(dirname "$listing")"
  : >"$listing"
  # Values remembered from the previous run; the files do not exist on the
  # first run, hence the silenced errors.
  oldlastcap=$(cat "./$mainurl.lastcap.txt" 2>/dev/null)
  oldlastyear=$(cat "./$mainurl.lastyear.txt" 2>/dev/null)
  [[ "$oldlastyear" =~ ^[1-9][0-9]*$ ]] || oldlastyear=$lastyear

  for ((y = firstyear; y <= lastyear; y++)); do
    u="/web/${y}0101000000*/${mainurl#/web/*/}"
    mkdir -p "$(dirname "./$u.html")"
    # Fetch the year index when it is not cached yet, or when the last
    # capture date changed and this year is at or after the previously
    # recorded last year.
    if ! [[ -s "./$u.html" ]] ||
      { ((y >= oldlastyear)) && [[ "$lastcap" != "$oldlastcap" ]]; }; then
      # $wget is deliberately unquoted: it holds the command plus its options.
      # shellcheck disable=SC2086
      $wget "$webarchive$u" -O "./$u.html" ||
        printf 'warning: could not fetch %s\n' "$webarchive$u" >&2
    fi
    [[ -s "./$u.html" ]] || continue
    #<"./$u.html" egrep 'href="/web/[0-9]+\*' | sed -r 's/.*href="([^"]*)".*/\1/' >"$d/$f.txt"
    grep -E 'href="/web/[0-9]*/' <"./$u.html" |
      sed -r 's/.*href="([^"]*)".*/\1/' >>"$listing"
  done

  printf %s "$lastcap" >"./$mainurl.lastcap.txt"
  printf %s "$lastyear" >"./$mainurl.lastyear.txt"
  # The input redirection must belong to `sort`: in bash a bare
  # `<file | sort` pipes nothing at all, which would leave listing2.txt
  # without any of the captures collected above.
  LC_ALL=C sort -u <"$listing" >>listing2.txt
}

: >listing2.txt
while read -r line || [[ -n "$line" ]]; do
  [[ -n "$line" ]] || continue
  #printf "%s\n" "$line"
  # Split on TABs with `read -a` instead of an unquoted expansion, so that the
  # "*" inside wildcard URLs can never be expanded against the filesystem.
  IFS=$'\t' read -r -a elems <<<"$line"
  lastcap=${elems[0]}
  firstyear=${elems[1]}
  lastyear=${elems[2]}
  mainurl=${elems[3]}
  if [[ -z "$mainurl" ]]; then
    printf 'warning: skipping malformed listing line: %s\n' "$line" >&2
    continue
  fi
  #echo "Main URL: $firstyear->$lastyear $mainurl"
  if [[ "$mainurl" == *'/web/*/'* ]]; then
    expand_wildcard_entry "$lastcap" "$firstyear" "$lastyear" "$mainurl"
  else
    printf '%s\n' "$mainurl" >>listing2.txt
  fi
done <listing.txt
# Construct listing3.txt
#
# listing2.txt is sorted, de-duplicated, and "id_" is appended to the
# timestamp of every URL so that the unmodified page is requested.
# Each URL must start with "/web/YYYYMMDDHHMMSSid_/" only.
# This is the list of files that need to be downloaded (if not already
# present).
# LC_ALL=C makes the ordering byte-wise and independent of the user's locale.
LC_ALL=C sort -u <listing2.txt |
  sed -r 's:^/web/([0-9]*)/:/web/\1id_/:' >listing3.txt

#######################################
# Map a capture URL to the local file it is stored in.
# Arguments: $1 - capture URL ("/web/...")
# Outputs:   "./URL/.index" when the URL ends with a slash (so that the URL
#            itself can be a directory), "./URL" otherwise
#######################################
capture_file_for() {
  local capture=$1
  # A quoted glob test: comparing against an unquoted "${capture%/}" would
  # treat URLs containing "[...]" as if they ended with a slash.
  if [[ "$capture" == */ ]]; then
    printf '%s\n' "./$capture/.index"
  else
    printf '%s\n' "./$capture"
  fi
}

# Download listing3
while read -r url || [[ -n "$url" ]]; do
  [[ -n "$url" ]] || continue
  f=$(capture_file_for "$url")
  mkdir -p "$(dirname "$f")"
  # Only fetch what is missing or empty; a failed download leaves an empty
  # file behind, so it is attempted again on the next run.
  if ! [[ -s "$f" ]]; then
    # $wget is deliberately unquoted: it holds the command plus its options.
    # shellcheck disable=SC2086
    $wget "$webarchive$url" -O "$f" ||
      printf 'warning: could not fetch %s\n' "$webarchive$url" >&2
  fi
done <listing3.txt
@OlegKorn
Copy link

OlegKorn commented Nov 4, 2025

Do you mean this is for downloading some PDF books that are only available for borrowing? Or what?

@rokke-git
Copy link

rokke-git commented Nov 4, 2025

Do you mean this is for downloading some PDF books that are only available for borrowing? Or what?

This is for downloading webpages saved by the Internet Archive. Some websites have pages which are just a PDF; the second one-liner in my most recent comment filters all the pages the Internet Archive has saved under the URL down to just the pages which are PDFs.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment