#!/bin/bash
url=http://redefininggod.com
webarchive=https://web.archive.org
wget="wget -e robots=off -nv"
tab="$(printf '\t')"
additional_url=url.list

# Construct listing.txt from url.list
# The list of archived pages, including some wildcard urls.
# Each line contains fields separated by tabs:
# - the last capture date (opaque format; if it differs, the last year index file
#   will be redownloaded)
# - the first capture year (hint for the oldest index to query)
# - the last capture year (hint for the latest index to query)
# - the url, starting with "/web/YYYYMMDDHHMMSS/" or "/web/*/" (only)
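# Hypothetical example of a listing.txt line (fields are joined by real tabs,
# shown here as <TAB>; the date and years are invented):
#   Wed Jan  1 00:00:00 UTC 2014<TAB>1996<TAB>2014<TAB>/web/*/http://redefininggod.com/page.html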
: >listing.txt
# Add url.list to listing.txt
while read url; do
  if [[ -z "$url" ]]; then continue; fi
  if [[ $url != ${url#http*/web.archive.org} ]]; then
    url="${url#http*/web.archive.org}"
  elif [[ $url != ${url%/\*} ]]; then
    mkdir -p "$(dirname "./web/*/$url")"
    $wget "$webarchive/web/*/$url" -O "./web/*/$url.html"
    # <listing.html fgrep 'href="/web/' | cut -d'"' -f2 >listing.txt
    <"./web/*/$url.html" sed -r -e '
      /<table id="resultsUrl">/,/<\/table>/ {
        /a href/ {
          s/.*href="(.*)".*/\1/;
          h
        };
        /dateFrom/ {
          s/.*([0-9]{4})<\/td>.*/\1/;
          x;
          H
        };
        /dateTo/ {
          s/.*>(.*)([0-9]{4})<\/td>.*/\1\2\n\2/;
          x;
          H
        };
        /<\/tr>/ {
          x;
          s/(.*)\n(.*)\n(.*)\n(.*)/\1\t\3\t\2\t\4/;
          p
        }
      };
      d' >"./web/*/$url.txt"
    cat "./web/*/$url.txt" >>listing.txt
    continue
  else
    url="/web/*/$url"
  fi
  printf "%s\t%s\t%s\t%s\n" "$(date)" 1996 2014 "$url" >>listing.txt
done <"$additional_url"
# Construct listing2.txt
# Remove the wildcard urls and fetch all the versions from the index
# Lines contain only the URL, starting with "/web/YYYYMMDDHHMMSS/" (only)
# It may contain duplicates
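# Hypothetical example of a listing2.txt line (timestamp and page are invented):
#   /web/20140101000000/http://redefininggod.com/page.html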
: >listing2.txt
while read line; do
  if [[ -z "$line" ]]; then continue; fi
  #printf "%s\n" "$line"
  oldifs="$IFS"
  IFS="$tab" elems=($line)
  IFS="$oldifs"
  lastcap="${elems[0]}"
  firstyear="${elems[1]}"
  lastyear="${elems[2]}"
  mainurl="${elems[3]}"
  #echo "Main URL: $firstyear->$lastyear $mainurl"
  if [[ $mainurl =~ '/web/*/' ]]; then
    listing="./$mainurl.txt"
    mkdir -p "$(dirname "$listing")"
    : >"$listing"
    oldlastcap="$(cat "./$mainurl.lastcap.txt" 2>/dev/null)"
    oldlastyear="$(cat "./$mainurl.lastyear.txt" 2>/dev/null)"
    : ${oldlastyear:=$lastyear}
    for y in $(seq $firstyear $lastyear); do
      u="/web/${y}0101000000*/${mainurl#/web/*/}"
      mkdir -p "$(dirname "./$u.html")"
      if ! [[ -s "./$u.html" ]] || ([[ $y -ge $oldlastyear ]] && [[ $lastcap != $oldlastcap ]]); then
        $wget "$webarchive$u" -O "./$u.html"
      fi
      #<"./$u.html" egrep 'href="/web/[0-9]+\*' | sed -r 's/.*href="([^"]*)".*/\1/' >"$d/$f.txt"
      <"./$u.html" egrep 'href="/web/[0-9]*/' | sed -r 's/.*href="([^"]*)".*/\1/' >>"$listing"
    done
    printf %s "$lastcap" >"./$mainurl.lastcap.txt"
    printf %s "$lastyear" >"./$mainurl.lastyear.txt"
    sort "$listing" | uniq >>listing2.txt
  else
    echo "$mainurl" >>listing2.txt
  fi
done <listing.txt
# Construct listing3.txt
# sort, uniq, and request the unmodified page by appending id_ to the timestamp
# URLs must start with "/web/YYYYMMDDHHMMSSid_/" only.
# This is the list of files that need to be downloaded (if not already present)
<listing2.txt sort | uniq | sed -r 's:^/web/([0-9]*)/:/web/\1id_/:' >listing3.txt
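# Hypothetical example of the rewrite done by the sed above (timestamp and page are invented):
#   /web/20140101000000/http://redefininggod.com/page.html
#   -> /web/20140101000000id_/http://redefininggod.com/page.html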
# Download listing3
while read url; do
  if [[ $url != ${url%/} ]]; then
    f="./$url/.index"
  else
    f="./$url"
  fi
  mkdir -p "$(dirname "$f")"
  if ! [[ -s "$f" ]]; then
    $wget "$webarchive$url" -O "$f"
  fi
done <listing3.txt
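For reference, a minimal sketch of the url.list file the script reads; the entries below are hypothetical and only illustrate the three accepted forms (a wildcard prefix, a plain page URL, and a full web.archive.org capture URL):

cat >url.list <<'EOF'
http://redefininggod.com/*
http://redefininggod.com/page.html
https://web.archive.org/web/20140101000000/http://redefininggod.com/page.html
EOF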
@rokke-git THANK YOU! your scripts have been life savers. I wanted to archive a ton of PDF files to my computer. I was trying to use the API before, but I did not realize until I read your scripts that the asterisk * was necessary. I will attach my modified scripts below. Also, I did not experience the 'filenames being dumb' thing. Maybe it was just on your end.
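As an aside, a minimal sketch of what the wildcard changes in the CDX query (example.org is a placeholder; per the CDX docs, a trailing * is equivalent to matchType=prefix):

# exact match: only captures of the single URL "example.org"
wget "http://web.archive.org/cdx/search/cdx?url=example.org" -O-
# prefix match: captures of every URL under example.org/
wget "http://web.archive.org/cdx/search/cdx?url=example.org*" -O-
# the same thing spelled out explicitly
wget "http://web.archive.org/cdx/search/cdx?url=example.org/&matchType=prefix" -O-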
#!/bin/bash
# Download files from an archived website from http://web.archive.org
# Will only use 1 connection.
url="screentalk.org"
# API docs
# https://archive.org/developers/wayback-cdx-server.html
# only grab PDF files
filter="&filter=mimetype:application/pdf"
#
# the asterisk '*' is very important!!!!
# otherwise only the 'base' url will be returned and you will
# not find any files!
wget "http://web.archive.org/cdx/search/cdx?url=${url}*${filter}&output=json&fl=original,timestamp" -O out.json
sed -Eni '2,$s%^\["([^"]*)","([^"]*)"](,|])$%https://web.archive.org/web/\2id_/\1%gmp' out.json
wget -i out.json
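To make the sed step concrete: the CDX JSON output puts one row per line, and the in-place sed above keeps only the data rows, rewritten into direct id_ URLs. A hypothetical before/after (filenames and timestamps invented):

# out.json as downloaded:
#   [["original","timestamp"],
#   ["http://screentalk.org/sample.pdf","20060101123456"],
#   ["http://screentalk.org/other.pdf","20070203045607"]]
# out.json after the sed:
#   https://web.archive.org/web/20060101123456id_/http://screentalk.org/sample.pdf
#   https://web.archive.org/web/20070203045607id_/http://screentalk.org/other.pdf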
#!/bin/bash
# Download files from an archived website from http://web.archive.org
# Will not reuse the previous connection, may get blocked!
# If you would like to reuse a single connection, you must tell
# wget to read from a single file.
url="screentalk.org"
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&output=json&fl=original,timestamp,mimetype" -O out.json
grep 'application/pdf' < out.json | sed "s/.\{0,20\}$//; /^$/d;s/$/],/" > out2.json
sed -En '1,$s%^\["([^"]*)","([^"]*)"](,|])$%wget "https://web.archive.org/web/\2id_/\1" -O \2.pdf%gmp' <out2.json >out3.json
cat out3.json | sh
@TrevCan wow, totally forgot about this. glancing at this again, all three lines can be simplified; here's a oneliner which doesn't make some random out.json:
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&fl=timestamp,original" -O- | \
sed 's%^%https://web.archive.org/web/%;s% %id_/%' | wget -i-

also, you can do your filtering inside the same sed call in various ways, for example:
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&fl=timestamp,original,mimetype" -O- | \
sed -n 's%^%https://web.archive.org/web/%;s% %id_/%;s% application/pdf%%p' | wget -i-

you mean this is for downloading some pdf books, only available for borrowing? or what?
this is for downloading webpages saved by the internet archive. some websites have pages which are just a pdf; the second oneliner in my most recent comment filters all the pages the internet archive has saved under the url down to just the pages which are pdfs
you can do all this in just a few lines by using the api:
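A minimal sketch of what that can look like (the domain is a placeholder; the query and the id_ rewrite follow the same pattern as the oneliners above):

url="example.org"
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&fl=timestamp,original" -O cdx.txt
sed 's%^%https://web.archive.org/web/%;s% %id_/%' cdx.txt >urls.txt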
if you don't care about the filenames being dumb, wget will reuse the same connection for all the urls in a list, making it faster:
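A minimal sketch, continuing from the hypothetical urls.txt above:

# a single wget process on a single host reuses the connection via HTTP keep-alive
wget -i urls.txt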
you can even filter the results and such, see the docs
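A minimal sketch of server-side filtering (filter and mimetype are documented CDX parameters; the domain is a placeholder):

wget "http://web.archive.org/cdx/search/cdx?url=example.org*&fl=timestamp,original&filter=mimetype:application/pdf" -O- | \
  sed 's%^%https://web.archive.org/web/%;s% %id_/%' | wget -i-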