Download files from archive.org
#!/usr/bin/env bash
# vim: set ts=2 sw=2 sts=2 et:
#
# ----------------------------------------------------------------------------
#
# archive-dl.sh - Download files from archive.org
#
# Author : Lucas Vasconcelos <[email protected]>
# Date ..: 2024-05-25
# URL ...: https://gist.github.com/lucasvscn/a8e8dcc4908a92c01dd0af0bed54219a
#
# ----------------------------------------------------------------------------
#
# This script scrapes the archive.org download listing for a given BASE_PATH
# and downloads all files from it. It uses a combination of `curl`, `grep`,
# and `sed` to extract the links from the page.
#
# The script will save the files in the given directory. If the directory
# doesn't exist, it will be created. If no directory is given, the files will
# be saved in the current directory.
#
# Usage:
# $ archive-dl.sh BASE_PATH [DEST]
#
# Examples:
#
# $ archive-dl.sh "cylums-neo-geo-rom-collection" /path/to/save
#
#
# ----------------------------------------------------------------------------
#
# MIT License
#
# ---[ Configuration ]--------------------------------------------------------
# Archive.org URL
URL="https://archive.org/download"
DEBUG="false"
MAX_CONCURRENT_DOWNLOADS=5
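# Note: these values are read from this file, not from the environment; set
# DEBUG="true" above to get the debug() messages, and raise or lower
# MAX_CONCURRENT_DOWNLOADS to cap the number of parallel curl processes.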
# ---[ Functions ]------------------------------------------------------------
# Print the usage message
usage() {
  echo "Usage: $0 BASE_PATH [DEST]"
  echo
  echo "Examples:"
  echo "  $0 \"cylums-neo-geo-rom-collection\" /path/to/save"
  echo
}
# Print verbose/debug messages
debug() {
  if [ "$DEBUG" = "true" ]; then
    echo "DEBUG: $*"
  fi
}
# Download the HTML page from the given URL and store it in a file named
# `index.html` in the DEST directory.
download_page() {
  local URL=$1
  local DEST=$2

  # Download the page
  debug "Downloading page: $URL"
  curl -s "$URL" -o "$DEST/index.html"
}
# Extract the links from the HTML page and store them in a file named
# `links.txt` in the DEST directory.
extract_links() {
  local DEST=$1

  # First we use sed to extract the contents of a table identified by the
  # class `directory-listing-table`. Then we use grep to extract the links
  # and store them in a file named `links.txt`.
  sed -n '/<table class="directory-listing-table">/,/<\/table>/p' "$DEST/index.html" | \
    grep -oP 'href="\K[^"]+' > "$DEST/links.txt"
}
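# For reference, a simplified (hypothetical) listing row from index.html looks
# roughly like:
#
#   <table class="directory-listing-table">
#     <tr><td><a href="Metal%20Slug%203.zip">Metal Slug 3.zip</a></td>...</tr>
#   </table>
#
# so links.txt ends up with one href per line, still URL-encoded, e.g.:
#
#   Metal%20Slug%203.zip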
# Clean the links file by removing the first line (the parent directory link)
clean_links() {
  local DEST=$1

  # Remove the first line from the file
  sed -i '1d' "$DEST/links.txt"
}
# Determine if a link is a directory or a file
is_directory() {
  local link=$1

  # Check if the link ends with a `/`
  [[ $link == */ ]]
}
# Determine if a link is a directory listing the contents of a zip file.
is_zip_directory() {
  local link=$1

  # Check if the link ends with `.zip/`
  [[ $link == *.zip/ ]]
}
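# Example (hypothetical item): archive.org lets you browse inside an archive
# via a link such as "set1.zip/". We skip those listings because the zip file
# itself ("set1.zip") also appears as a regular link and gets downloaded.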
# Decode URL-encoded characters in the link
urldecode() {
  local encoded="${*//+/ }"
  printf '%b' "${encoded//%/\\x}"
}
# Remove trailing slashes from a given path.
remove_trailing_slash() {
  local input="$1"

  # Remove the trailing slash if there is one
  if [ "${input: -1}" = "/" ]; then
    input="${input%/}"
  fi

  echo "$input"
}
# Download the files from the links file
download_files() {
  local DEST
  DEST=$(remove_trailing_slash "$1")

  # Iterate over the links and download the files
  while read -r link; do
    # Decode the URL
    filename=$(urldecode "$link")

    # If the link is a zip-contents listing, ignore it. If it is a
    # directory, create it locally and call the script recursively.
    # Otherwise, download the file.
    if is_zip_directory "$link"; then
      debug "Ignoring zip directory: $filename"
    elif is_directory "$link"; then
      # Create the directory if it doesn't exist
      debug "Creating directory: $DEST/$filename"
      mkdir -p "$DEST/$filename"

      # Call the script recursively
      debug "Calling script recursively: $0 $(remove_trailing_slash "$URL_PATH")/$link $DEST/$filename"
      $0 "$(remove_trailing_slash "$URL_PATH")/$link" "$DEST/$filename"
    else
      # Download the file in the background and limit the number of
      # concurrent downloads to MAX_CONCURRENT_DOWNLOADS
      while [ "$(jobs | wc -l)" -ge "$MAX_CONCURRENT_DOWNLOADS" ]; do
        sleep 1
      done

      local url_path
      url_path=$(remove_trailing_slash "$URL_PATH")
      debug "Downloading file: $filename from $URL/$url_path/$link"
      curl -s -L -C - "$URL/$url_path/$link" -o "$DEST/$filename" &
    fi
  done < "$DEST/links.txt"

  # Wait for the remaining background downloads before returning, so recursive
  # invocations (and the caller) know everything in this listing is done.
  wait
}
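# Sketch of the resulting flow for a hypothetical links.txt (names are
# illustrative only):
#
#   link          action
#   -----------   ----------------------------------------------------------
#   roms/         mkdir DEST/roms, then re-run: $0 BASE_PATH/roms DEST/roms
#   set1.zip/     skipped (listing of the zip's contents)
#   set1.zip      downloaded in the background, at most
#                 MAX_CONCURRENT_DOWNLOADS at a time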
# ---[ Main ]-----------------------------------------------------------------
# Check if the script has at least one argument
if [ $# -lt 1 ]; then
usage
exit 1
fi
# Get the URL_PATH and DEST from the arguments
URL_PATH=$1
DEST=${2:-.}
# Create the directory if it doesn't exist
mkdir -p "$DEST"
# Download the page
download_page "$URL/$URL_PATH" "$DEST"
# Extract the links
extract_links "$DEST"
# Clean the links
clean_links "$DEST"
# Download the files
download_files "$DEST"