#!/bin/bash
url=http://redefininggod.com
webarchive=https://web.archive.org
wget="wget -e robots=off -nv"
tab="$(printf '\t')"
additional_url=url.list
# Construct listing.txt from url.list
# The list of archived pages, including some wildcard urls.
# Each line contains fields separated by tabs:
# - the last capture date (opaque format; if it changed, the last year index file
#   will be redownloaded)
# - the first capture year (hint for which is the oldest index to query)
# - the last capture year (hint for which is the latest index to query)
# - the url, starting with "/web/YYYYMMDDHHMMSS/" or "/web/*" (only)
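# Example listing.txt line (hypothetical values; the separator is a real tab,
# written as <TAB> here for readability):
#   Dec 31, 2014<TAB>1998<TAB>2014<TAB>/web/*/http://redefininggod.com/page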
: >listing.txt
# Add url.list to listing.txt
while read url; do
  if [[ -z "$url" ]]; then continue; fi
  if [[ $url != ${url#http*/web.archive.org} ]]; then
    url="${url#http*/web.archive.org}"
  elif [[ $url != ${url%/\*} ]]; then
    mkdir -p "$(dirname "./web/*/$url")"
    $wget "$webarchive/web/*/$url" -O "./web/*/$url.html"
    # <listing.html fgrep 'href="/web/' | cut -d'"' -f2 >listing.txt
    <"./web/*/$url.html" sed -r -e '
      /<table id="resultsUrl">/,/<\/table>/ {
        /a href/ {
          s/.*href="(.*)".*/\1/;
          h
        };
        /dateFrom/ {
          s/.*([0-9]{4})<\/td>.*/\1/;
          x;
          H
        };
        /dateTo/ {
          s/.*>(.*)([0-9]{4})<\/td>.*/\1\2\n\2/;
          x;
          H
        };
        /<\/tr>/ {
          x;
          s/(.*)\n(.*)\n(.*)\n(.*)/\1\t\3\t\2\t\4/;
          p
        }
      };
      d' >"./web/*/$url.txt"
    cat "./web/*/$url.txt" >>listing.txt
    continue
  else
    url="/web/*/$url"
  fi
  printf "%s\t%s\t%s\t%s\n" "$(date)" 1996 2014 "$url" >>listing.txt
done <"$additional_url"
# Construct listing2.txt
# Remove the wildcard urls and fetch all the versions from the index.
# Lines only contain the URL, starting with "/web/YYYYMMDDHHMMSS/" (only).
# It may contain duplicates.
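# Example listing2.txt lines (hypothetical captures):
#   /web/20090815093012/http://redefininggod.com/page
#   /web/20121104221540/http://redefininggod.com/page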
: >listing2.txt
while read line; do
  if [[ -z "$line" ]]; then continue; fi
  #printf "%s\n" "$line"
  oldifs="$IFS"
  IFS="$tab" elems=($line)
  IFS="$oldifs"
  lastcap="${elems[0]}"
  firstyear="${elems[1]}"
  lastyear="${elems[2]}"
  mainurl="${elems[3]}"
  #echo "Main URL: $firstyear->$lastyear $mainurl"
  if [[ $mainurl =~ '/web/*/' ]]; then
    listing="./$mainurl.txt"
    mkdir -p "$(dirname "$listing")"
    : >"$listing"
    oldlastcap="$(cat "./$mainurl.lastcap.txt" 2>/dev/null)"
    oldlastyear="$(cat "./$mainurl.lastyear.txt" 2>/dev/null)"
    : ${oldlastyear:=$lastyear}
    for y in $(seq $firstyear $lastyear); do
      u="/web/${y}0101000000*/${mainurl#/web/*/}"
      mkdir -p "$(dirname "./$u.html")"
      if ! [[ -s "./$u.html" ]] || ([[ $y -ge $oldlastyear ]] && [[ $lastcap != $oldlastcap ]]); then
        $wget "$webarchive$u" -O "./$u.html"
      fi
      #<"./$u.html" egrep 'href="/web/[0-9]+\*' | sed -r 's/.*href="([^"]*)".*/\1/' >"$d/$f.txt"
      <"./$u.html" egrep 'href="/web/[0-9]*/' | sed -r 's/.*href="([^"]*)".*/\1/' >>"$listing"
    done
    printf %s "$lastcap" >"./$mainurl.lastcap.txt"
    printf %s "$lastyear" >"./$mainurl.lastyear.txt"
    sort <"$listing" | uniq >>listing2.txt
  else
    echo "$mainurl" >>listing2.txt
  fi
done <listing.txt
# Construct listing3.txt
# Sort, uniq, and use the unmodified page by appending id_ to the timestamp.
# URLs must start with "/web/YYYYMMDDHHMMSSid_/" only.
# This is the list of files that need to be downloaded (if not already present).
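# e.g. (hypothetical capture):
#   /web/20090815093012/http://redefininggod.com/page -> /web/20090815093012id_/http://redefininggod.com/page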
<listing2.txt sort | uniq | sed -r 's:^/web/([0-9]*)/:/web/\1id_/:' >listing3.txt
# Download listing3
while read url; do
  if [[ $url != ${url%/} ]]; then
    f="./$url/.index"
  else
    f="./$url"
  fi
  mkdir -p "$(dirname "$f")"
  if ! [[ -s "$f" ]]; then
    $wget "$webarchive$url" -O "$f"
  fi
done <listing3.txt
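A minimal sketch of how the script might be driven, assuming it is saved as download.sh (the name used in the comments below) next to a url.list file; both example entries are hypothetical:

# url.list is the file named in additional_url above: one URL per line,
# with an optional trailing /* to request the wildcard/index expansion
cat > url.list <<'EOF'
http://redefininggod.com/*
http://redefininggod.com/p/some-page.html
EOF
bash download.sh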
Didn't manage to get it to work; ended up coding a small gem in Ruby: https://github.com/hartator/wayback-machine-downloader
./download.sh: line 62: url.list: No such file or directory
For those having problems like "./download.sh: line 62: url.list: No such file or directory" go to lines 21 and 62 and comment them out by placing a # character in front of them. Make sure you edit line 3 to point to your domain before you run the command.
However, I think this only manages to pull the indexes (calendars), not the actual archived pages.
you can do all this in just a few lines by using the api:
#!/bin/bash
url= #your url
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&output=json&fl=original,timestamp" -O out.json
sed -Eni '2,$s%^\["([^"]*)","([^"]*)"](,|])$%wget "https://web.archive.org/web/\2id_/\1" -O \2.html%gmp' out.json
cat out.json | sh

if you don't care about the filenames being dumb, wget will reuse the same connection for all the urls in a list, making it faster:
#!/bin/bash
url= #your url
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&output=json&fl=original,timestamp" -O out.json
sed -Eni '2,$s%^\["([^"]*)","([^"]*)"](,|])$%https://web.archive.org/web/\2id_/\1%gmp' out.json
wget -i out.json

you can even filter the results and such, see the docs
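For example, a sketch of doing the filtering in the CDX query itself (parameter names per the CDX server API; the statuscode and mimetype filters are just one possible choice):

#!/bin/bash
url= #your url
# keep only captures that returned HTTP 200 and are PDF files
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&output=json&fl=original,timestamp&filter=statuscode:200&filter=mimetype:application/pdf" -O out.json
sed -Eni '2,$s%^\["([^"]*)","([^"]*)"](,|])$%https://web.archive.org/web/\2id_/\1%gmp' out.json
wget -i out.json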
@rokke-git THANK YOU! your scripts have been life savers. I wanted to archive a ton of PDF files to my computer. I was trying to use the API before, but I did not realize until I read your scripts that the asterisk * was necessary. I will attach my modified scripts below. Also, I did not experience the 'filenames being dumb' thing. Maybe it was just on your end.
#!/bin/bash
# Download files from an archived website from http://web.archive.org
# Will only use 1 connection.
url="screentalk.org"
# API docs
# https://archive.org/developers/wayback-cdx-server.html
# only grab PDF files
filter="&filter=mimetype:application/pdf"
#
# the asterisk '*' is very important!!!!
# otherwise only the 'base' url will be returned and you will
# not find any files!
wget "http://web.archive.org/cdx/search/cdx?url=${url}*${filter}&output=json&fl=original,timestamp" -O out.json
sed -Eni '2,$s%^\["([^"]*)","([^"]*)"](,|])$%https://web.archive.org/web/\2id_/\1%gmp' out.json
wget -i out.json
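As a side note on the asterisk mentioned above: a trailing '*' is shorthand for a prefix match, and the CDX server also accepts an explicit matchType parameter, so a roughly equivalent request (a sketch reusing the url and filter variables from the script above) would be:

wget "http://web.archive.org/cdx/search/cdx?url=${url}&matchType=prefix${filter}&output=json&fl=original,timestamp" -O out.json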
#!/bin/bash
# Download files from an archived website from http://web.archive.org
# Will not reuse the previous connection, may get blocked!
# If you would like to reuse a single connection, you must tell
# wget to read from a single file.
url="screentalk.org"
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&output=json&fl=original,timestamp,mimetype" -O out.json
grep 'application/pdf' < out.json | sed "s/.\{0,20\}$//; /^$/d;s/$/],/" > out2.json
sed -En '1,$s%^\["([^"]*)","([^"]*)"](,|])$%wget "https://web.archive.org/web/\2id_/\1" -O \2.pdf%gmp' <out2.json >out3.json
cat out3.json | sh
@TrevCan wow, totally forgot about this. glancing at this again all three lines can be simplified; here's a oneliner which doesn't make some random out.json:
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&fl=timestamp,original" -O- | \
sed 's%^%https://web.archive.org/web/%;s% %id_/%' | wget -i-

also, you can do your filtering inside the same sed call in various ways, for example:
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&fl=timestamp,original,mimetype" -O- | \
sed -n 's%^%https://web.archive.org/web/%;s% %id_/%;s% application/pdf%%p' | wget -i-

you mean this is for d-ding some pdf books, only available for borrowing? or what?
you mean this is for d-ding some pdf books, only available for borrowing? or what?
this is for downloading webpages saved by the internet archive. some websites have pages which are just a pdf; the second one-liner in my most recent comment filters all the pages the internet archive has saved under the url down to just the pages which are pdfs
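A related sketch: to see which kinds of pages the archive holds for a site before filtering, the fl parameter can be pointed at mimetype (url is a placeholder, as in the scripts above):

# count the captures per mimetype for a site
wget "http://web.archive.org/cdx/search/cdx?url=${url}*&fl=mimetype" -O- | sort | uniq -c | sort -rn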
C:\Users\COMPUTER\Desktop\566\download.sh: syntax error near unexpected token `elems=($'
C:\Users\COMPUTER\Desktop\566\download.sh: line 77: `IFS="$tab" elems=($line)'