Download files from archive.org
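A typical invocation looks like the following (illustrative: it assumes the script below is saved as archive-dl.sh and made executable, and that ~/roms/neo-geo is where you want the files; neither path is part of the script itself):

    ./archive-dl.sh "cylums-neo-geo-rom-collection" ~/roms/neo-geo

This fetches the listing at https://archive.org/download/cylums-neo-geo-rom-collection and mirrors every file it links to into ~/roms/neo-geo, creating that directory if it doesn't exist.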
#!/usr/bin/env bash
# vim: set ts=2 sw=2 sts=2 et:
#
# ----------------------------------------------------------------------------
#
# archive-dl.sh - Download files from archive.org
#
# Author : Lucas Vasconcelos <[email protected]>
# Date ..: 2024-05-25
# URL ...: https://gist.github.com/lucasvscn/a8e8dcc4908a92c01dd0af0bed54219a
#
# ----------------------------------------------------------------------------
#
# This script scrapes an archive.org directory listing for a given BASE_PATH
# and downloads all files from it. It uses a combination of `curl`, `grep`
# and `sed` to extract the links from the page.
#
# The script saves the files in the given directory. If the directory
# doesn't exist, it will be created. If no directory is given, the files are
# saved in the current directory.
#
# Usage:
#   $ archive-dl.sh BASE_PATH [DEST]
#
# Examples:
#
#   $ archive-dl.sh "cylums-neo-geo-rom-collection" /path/to/save
#
# ----------------------------------------------------------------------------
#
# MIT License
#
# ---[ Configuration ]--------------------------------------------------------
# Base URL of archive.org's download endpoint
URL="https://archive.org/download"
# Set to "true" to print debug messages
DEBUG="false"
# Maximum number of simultaneous background downloads
MAX_CONCURRENT_DOWNLOADS=5
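# Example (illustrative values): for a run that prints what the script is
# doing and is gentler on archive.org, the two settings above could be
# edited to:
#
#   DEBUG="true"
#   MAX_CONCURRENT_DOWNLOADS=2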
# ---[ Functions ]------------------------------------------------------------
# Print the usage message
usage() {
  echo "Usage: $0 BASE_PATH [DEST]"
  echo
  echo "Examples:"
  echo "  $0 \"cylums-neo-geo-rom-collection\" /path/to/save"
  echo
}
# Print verbose/debug messages
debug() {
  if [ "$DEBUG" = "true" ]; then
    echo "DEBUG: $*"
  fi
}
# Download the HTML page from the given URL and store it in a file named
# `index.html` in the DEST directory.
download_page() {
  local URL=$1
  local DEST=$2
  # Download the page
  debug "Downloading page: $URL"
  curl -s "$URL" -o "$DEST/index.html"
}
# Extract the links from the HTML page and store them in a file named
# `links.txt` in the DEST directory.
extract_links() {
  local DEST=$1
  # First we use sed to extract the contents of a table identified by the
  # class `directory-listing-table`. Then we use grep to extract the links
  # and store them in a file named `links.txt`.
  sed -n '/<table class="directory-listing-table">/,/<\/table>/p' "$DEST/index.html" | \
    grep -oP 'href="\K[^"]+' > "$DEST/links.txt"
}
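# As a rough illustration (hypothetical entries, not real listing contents),
# links.txt ends up holding one href per row of the listing table, with the
# first line being the link back to the parent page, e.g.:
#
#   /details/cylums-neo-geo-rom-collection
#   Some%20Game%20(Neo%20Geo).zip
#   extras/
#
# which is why clean_links below drops the first line.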
# Clean the links file by removing the first line (the parent directory link)
clean_links() {
  local DEST=$1
  # Remove the first line from the file
  sed -i '1d' "$DEST/links.txt"
}
# Determine if a link is a directory or a file
is_directory() {
  local link=$1
  # Check if the link ends with a `/`
  [[ $link == */ ]]
}
# Determine if a link is a directory listing the contents of a zip file.
is_zip_directory() {
  local link=$1
  # Check if the link ends with `.zip/`
  [[ $link == *.zip/ ]]
}
# Decode URL-encoded characters in the link
urldecode() {
  local encoded="${*//+/ }"
  printf '%b' "${encoded//%/\\x}"
}
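# For example (illustrative filename), urldecode would turn
#   Metal%20Slug%203%20(Neo%20Geo).zip
# into
#   Metal Slug 3 (Neo Geo).zip
# It works by mapping "+" to a space, rewriting each "%" as "\x", and letting
# printf '%b' expand the resulting \xHH escapes.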
# Remove trailing slashes from a given path.
remove_trailing_slash() {
  local input="$1"
  # Remove the trailing slash if it exists
  if [ "${input: -1}" = "/" ]; then
    input="${input%/}"
  fi
  echo "$input"
}
# Download the files from the links file
download_files() {
  local DEST=$(remove_trailing_slash "$1")
  # Work with the remote path without a trailing slash.
  local URL_PATH=$(remove_trailing_slash "$URL_PATH")
  local link filename
  # Iterate over the links and download the files
  while read -r link; do
    # Decode the URL-encoded link into a local filename
    filename=$(urldecode "$link")
    # If the link lists the contents of a zip file, ignore it.
    # If it is a directory, create it locally and call the script recursively.
    # Otherwise, download the file.
    if is_zip_directory "$link"; then
      debug "Ignoring zip directory: $filename"
    elif is_directory "$link"; then
      # Create the directory if it doesn't exist
      debug "Creating directory: $DEST/$filename"
      mkdir -p "$DEST/$filename"
      # Call the script recursively for the sub-directory
      debug "Calling script recursively: $0 $URL_PATH/$link $DEST/$filename"
      "$0" "$URL_PATH/$link" "$DEST/$filename"
    else
      # Download the file in the background, limiting the number of
      # concurrent downloads to MAX_CONCURRENT_DOWNLOADS
      while [ "$(jobs -r | wc -l)" -ge "$MAX_CONCURRENT_DOWNLOADS" ]; do
        sleep 1
      done
      debug "Downloading file: $filename from $URL/$URL_PATH/$link"
      curl -s -L -C - "$URL/$URL_PATH/$link" -o "$DEST/$filename" &
    fi
  done < "$DEST/links.txt"
  # Wait for the remaining background downloads to finish
  wait
}
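# Illustrative walk-through of download_files (hypothetical links.txt entries):
#
#   extras/              -> directory: created locally, script re-invoked for it
#   Some%20Game.zip      -> file: downloaded in the background (resumable, -C -)
#   Some%20Game.zip/     -> zip-contents listing: skipped
#
# At most MAX_CONCURRENT_DOWNLOADS curl jobs run at any one time.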
# ---[ Main ]-----------------------------------------------------------------
# Check if the script has at least one argument
if [ $# -lt 1 ]; then
  usage
  exit 1
fi
# Get URL_PATH and DEST from the arguments
URL_PATH=$1
DEST=${2:-.}
# Create the destination directory if it doesn't exist
mkdir -p "$DEST"
# Download the directory listing page
download_page "$URL/$URL_PATH" "$DEST"
# Extract the links
extract_links "$DEST"
# Clean the links
clean_links "$DEST"
# Download the files
download_files "$DEST"