Download files from archive.org
#!/usr/bin/env bash
# vim: set ts=2 sw=2 sts=2 et:
#
# ----------------------------------------------------------------------------
#
# archive-dl.sh - Download files from archive.org
#
# Author : Lucas Vasconcelos <[email protected]>
# Date ..: 2024-05-25
# URL ...: https://gist.github.com/lucasvscn/a8e8dcc4908a92c01dd0af0bed54219a
#
# ----------------------------------------------------------------------------
#
# This script scrapes the archive.org download listing for a given BASE_PATH
# and downloads all files from it. It uses a combination of `curl`, `grep`,
# and `sed` to extract the links from the page.
#
# The script will save the files in the given directory. If the directory
# doesn't exist, it will be created. If no directory is given, the files will
# be saved in the current directory.
#
# Usage:
# $ archive-dl.sh BASE_PATH [DEST]
#
# Examples:
#
# $ archive-dl.sh "cylums-neo-geo-rom-collection" /path/to/save
#
#
# ----------------------------------------------------------------------------
#
# MIT License
#
# ---[ Configuration ]--------------------------------------------------------
# Archive.org URL
URL="https://archive.org/download"
DEBUG="false"
MAX_CONCURRENT_DOWNLOADS=5
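# Note: these values are read from this file, not from the environment; set
# DEBUG="true" above to get the debug() messages, and raise or lower
# MAX_CONCURRENT_DOWNLOADS to cap the number of parallel curl processes.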
# ---[ Functions ]------------------------------------------------------------
# Print the usage message
usage() {
  echo "Usage: $0 BASE_PATH [DEST]"
  echo
  echo "Examples:"
  echo "  $0 \"cylums-neo-geo-rom-collection\" /path/to/save"
  echo
}
# Print verbose/debug messages
debug() {
  if [ "$DEBUG" = "true" ]; then
    echo "DEBUG: $*"
  fi
}
# Download the HTML page from the given URL and store it in a file named
# `index.html` in the DEST directory.
download_page() {
  local URL=$1
  local DEST=$2

  # Download the page
  debug "Downloading page: $URL"
  curl -s "$URL" -o "$DEST/index.html"
}
# Extract the links from the HTML page and store them in a file named
# `links.txt` in the DEST directory.
extract_links() {
  local DEST=$1

  # First we use sed to extract the contents of a table identified by the
  # class `directory-listing-table`. Then we use grep to extract the links
  # and store them in a file named `links.txt`.
  sed -n '/<table class="directory-listing-table">/,/<\/table>/p' "$DEST/index.html" | \
    grep -oP 'href="\K[^"]+' > "$DEST/links.txt"
}
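# For reference, a simplified (hypothetical) listing row from index.html looks
# roughly like:
#
#   <table class="directory-listing-table">
#     <tr><td><a href="Metal%20Slug%203.zip">Metal Slug 3.zip</a></td>...</tr>
#   </table>
#
# so links.txt ends up with one href per line, still URL-encoded, e.g.:
#
#   Metal%20Slug%203.zip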
# Clean the links file by removing the first line (the parent directory link)
clean_links() {
  local DEST=$1

  # Remove the first line from the file
  sed -i '1d' "$DEST/links.txt"
}
# Determine if a link is a directory or a file
is_directory() {
  local link=$1

  # Check if the link ends with a `/`
  [[ $link == */ ]]
}
# Determine if a link is a directory listing the contents of a zip file.
is_zip_directory() {
  local link=$1

  # Check if the link ends with `.zip/`
  [[ $link == *.zip/ ]]
}
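# Example (hypothetical item): archive.org lets you browse inside an archive
# via a link such as "set1.zip/". We skip those listings because the zip file
# itself ("set1.zip") also appears as a regular link and gets downloaded.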
# Decode URL-encoded characters in the link
urldecode() {
  local encoded="${*//+/ }"
  printf '%b' "${encoded//%/\\x}"
}
# Remove trailing slashes from a given path.
remove_trailing_slash() {
  local input="$1"

  # Remove the trailing slash if there is one
  if [ "${input: -1}" = "/" ]; then
    input="${input%/}"
  fi

  echo "$input"
}
# Download the files from the links file
download_files() {
  local DEST
  DEST=$(remove_trailing_slash "$1")

  # Iterate over the links and download the files
  while read -r link; do
    # Decode the URL
    filename=$(urldecode "$link")

    # If the link is a zip-contents listing, ignore it. If it is a
    # directory, create it locally and call the script recursively.
    # Otherwise, download the file.
    if is_zip_directory "$link"; then
      debug "Ignoring zip directory: $filename"
    elif is_directory "$link"; then
      # Create the directory if it doesn't exist
      debug "Creating directory: $DEST/$filename"
      mkdir -p "$DEST/$filename"

      # Call the script recursively
      debug "Calling script recursively: $0 $(remove_trailing_slash "$URL_PATH")/$link $DEST/$filename"
      $0 "$(remove_trailing_slash "$URL_PATH")/$link" "$DEST/$filename"
    else
      # Download the file in the background and limit the number of
      # concurrent downloads to MAX_CONCURRENT_DOWNLOADS
      while [ "$(jobs | wc -l)" -ge "$MAX_CONCURRENT_DOWNLOADS" ]; do
        sleep 1
      done

      local url_path
      url_path=$(remove_trailing_slash "$URL_PATH")
      debug "Downloading file: $filename from $URL/$url_path/$link"
      curl -s -L -C - "$URL/$url_path/$link" -o "$DEST/$filename" &
    fi
  done < "$DEST/links.txt"

  # Wait for the remaining background downloads before returning, so recursive
  # invocations (and the caller) know everything in this listing is done.
  wait
}
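# Sketch of the resulting flow for a hypothetical links.txt (names are
# illustrative only):
#
#   link          action
#   -----------   ----------------------------------------------------------
#   roms/         mkdir DEST/roms, then re-run: $0 BASE_PATH/roms DEST/roms
#   set1.zip/     skipped (listing of the zip's contents)
#   set1.zip      downloaded in the background, at most
#                 MAX_CONCURRENT_DOWNLOADS at a time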
# ---[ Main ]-----------------------------------------------------------------
# Check if the script has at least one argument
if [ $# -lt 1 ]; then
usage
exit 1
fi
# Get the URL_PATH and DEST from the arguments
URL_PATH=$1
DEST=${2:-.}
# Create the directory if it doesn't exist
mkdir -p "$DEST"
# Download the page
download_page "$URL/$URL_PATH" "$DEST"
# Extract the links
extract_links "$DEST"
# Clean the links
clean_links "$DEST"
# Download the files
download_files "$DEST"