Skip to content

Instantly share code, notes, and snippets.

@hightemp
Forked from svagionitis/wget-mirror-website.sh
Created February 23, 2024 10:55
Show Gist options
  • Save hightemp/95a866f8907397bd0c7527204d3a455a to your computer and use it in GitHub Desktop.
Save hightemp/95a866f8907397bd0c7527204d3a455a to your computer and use it in GitHub Desktop.

Revisions

  1. @svagionitis svagionitis revised this gist Jan 16, 2021. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions wget-mirror-website.sh
    Original file line number Diff line number Diff line change
    @@ -105,10 +105,10 @@ wget \
    --backup-converted \
    --adjust-extension \
    --page-requisites \
    --wait 3 \
    --wait 30 \
    --random-wait \
    --continue \
    --limit-rate=200k \
    --limit-rate=2k \
    --no-if-modified-since \
    --append-output="${website_host}.log" \
    --rejected-log="${website_host}-rejected.log" \
    @@ -119,4 +119,4 @@ wget \
    # Back up the directory of the mirrored website if backup is enabled
    if [ -n "${backup_dir}" ] ; then
    backup_dir "${website_host}"
    fi
    fi
  2. @svagionitis svagionitis created this gist Jan 16, 2021.
    122 changes: 122 additions & 0 deletions wget-mirror-website.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,122 @@
    #!/bin/sh -eu
    # A script to mirror a website using wget

    # Print the command-line help text to stdout.
    #
    # Takes no arguments. The program name shown on the first line is derived
    # from $0 with its directory part removed.
    #
    # NOTE(review): the help text mentions a "Firefox" default User agent, while
    # the default assigned further down in this script is a Chrome string --
    # confirm which one is intended.
    usage() {
        # One printf argument per output line is used instead of a here-document:
        # a here-document terminator has to start in column 0 (or after tabs with
        # "<<-"), so an indented "EOF" never closes it.
        printf '%s\n' \
            "Usage: ${0##*/} [-w website] [-u user_agent] [-b]" \
            "Where:" \
            '-w The website to mirror. The website will be like "https://example.com/"' \
            "-u The User agent to use. If no User agent is specified," \
            "a Firefox default one is used." \
            "-b Backup the website. If it's not added, it will not backedup." \
            "-h This help"
    }

    # Print the error, the usage and exit
    #
    # $1: The error to print
    #
    # The error and the usage text are written to stderr, so they never mix
    # with regular output. The function does not return: it terminates the
    # script with exit status 1.
    print_error_exit() {
        # ":-" keeps a call without argument from aborting under "set -u".
        error_print="${1:-}"

        # printf instead of echo: echo may interpret a leading "-n" or
        # backslashes contained in the message.
        printf '%s\n' "${error_print}" >&2
        usage >&2
        exit 1
    }

    # Backup a directory
    #
    # The format of the backup file will be
    # "Directory name to backup"_Backup-"The size of the directory"-"Current date in format %Y%m%d-%H%M%S".tar.xz
    # For example, if the directory is "example.com" the backup file will look like the following
    # example.com_Backup-12K-20210116-173838.tar.xz
    #
    # $1: The directory to backup
    #
    # Returns 1, after printing a message to stderr and without creating any
    # file, when $1 is not an existing directory. Returns 1 as well when tar or
    # xz fail.
    backup_dir() {
        directory_to_backup="${1:-}"

        # Without this guard a missing directory goes unnoticed: the exit status
        # of the "du | cut" pipeline below is the one of cut, and tar would
        # still be asked to create an archive.
        if [ ! -d "${directory_to_backup}" ] ; then
            printf '%s\n' "Cannot backup \"${directory_to_backup}\": not a directory" >&2
            return 1
        fi

        # The time is in UTC/GMT
        current_date=$(date -u +%Y%m%d-%H%M%S)
        # The size of the directory before compressed
        dir_size=$(du -sh "${directory_to_backup}" | cut -f 1)

        filename_of_backup_dir="${directory_to_backup}_Backup-${dir_size}-${current_date}.tar"

        # Create back up tar file of the dir
        tar cvf "${filename_of_backup_dir}" "${directory_to_backup}" || return 1

        # Compress tar file
        xz "${filename_of_backup_dir}" || return 1
    }

    # Check if wget is present in the system
    #
    # "|| true" is required: the script runs with -e, and the exit status of a
    # plain assignment is the one of its command substitution. Without it a
    # missing wget would abort the script right here, before the error message
    # below could be printed.
    IS_WGET_INSTALLED="$(command -v wget || true)"
    if [ -z "${IS_WGET_INSTALLED}" ] ; then
        print_error_exit "wget command is missing!!!"
    fi

    # Parse the command line options.
    #
    # The three option variables are given an empty default first. The script
    # runs with -u, so testing a variable whose option was not passed would
    # otherwise abort with an "unset variable" error instead of reaching the
    # checks that follow. It also stops same-named environment variables from
    # leaking into the script.
    website=""
    user_agent=""
    backup_dir=""

    while getopts "w:u:bh" opt; do
        case "${opt}" in
            w)
                # The website to mirror
                website=${OPTARG}
                ;;
            u)
                # The User agent sent by wget
                user_agent=${OPTARG}
                ;;
            b)
                # Any non-empty value enables the backup step
                backup_dir=1
                ;;
            h)
                usage
                exit 0
                ;;
            *)
                # Unknown option or missing option argument: getopts has already
                # printed its own diagnostic, the usage follows on stderr.
                usage >&2
                exit 1
                ;;
        esac
    done

    # Check if any options are used
    # (OPTIND is still 1 when getopts did not consume any argument)
    if [ "${OPTIND}" = 1 ] ; then
        print_error_exit "No options specified!"
    fi

    # The website is a mandatory argument
    # ":-" lets the test work under -u when -w was not passed at all.
    if [ -z "${website:-}" ] ; then
        print_error_exit "The [-w website] is required."
    fi

    # The user_agent is a *NOT* mandatory argument
    # ":-" lets the test work under -u when -u was not passed; in that case the
    # default below is used.
    if [ -z "${user_agent:-}" ] ; then
        user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    fi


    # Print the host part (the text between "scheme://" and the first "/", "?"
    # or "#") of a URL.
    #
    # $1: The URL, with or without a leading "scheme://"
    #
    # Examples: "https://example.com/a" -> "example.com"
    #           "example.com/a"         -> "example.com"
    _url_host() {
        _url_rest="${1:-}"
        # Candidate scheme: everything before the first "://".
        _url_scheme="${_url_rest%%://*}"
        case "${_url_scheme}" in
            # No "://" at all, nothing before it, or the "://" only shows up
            # after a "/", "?" or "#": the URL has no scheme to strip.
            "${_url_rest}" | "" | *[/?#]*) ;;
            *) _url_rest="${_url_rest#*://}" ;;
        esac
        printf '%s\n' "${_url_rest%%[/?#]*}"
    }

    # The host of the website; used to name the log files and the directory of
    # the mirror.
    website_host=$(_url_host "${website}")

    # The main wget command
    #
    # An empty host would turn the log file names into ".log" and
    # "-rejected.log" and the directory prefix into "/", so stop before wget
    # runs.
    if [ -z "${website_host}" ] ; then
        printf '%s\n' "Could not determine the host of \"${website}\"" >&2
        exit 1
    fi

    # The exit status of wget is captured instead of being left to -e: wget
    # exits with 8 as soon as the server answers any single request with an
    # error response (for example one broken link), which would abort the
    # script before the optional backup step even though the mirror is usable.
    wget_status=0
    wget \
        --debug \
        --mirror \
        --timestamping \
        --convert-links \
        --backup-converted \
        --adjust-extension \
        --page-requisites \
        --wait 3 \
        --random-wait \
        --continue \
        --limit-rate=200k \
        --no-if-modified-since \
        --append-output="${website_host}.log" \
        --rejected-log="${website_host}-rejected.log" \
        --user-agent="${user_agent}" \
        --directory-prefix="${website_host}/" \
        "${website}" || wget_status=$?

    case "${wget_status}" in
        0)
            ;;
        8)
            # Some responses were server errors; the details are in the log.
            printf '%s\n' "wget: the server issued error responses, see ${website_host}.log" >&2
            ;;
        *)
            # Any other failure is fatal and propagated as the exit status.
            printf '%s\n' "wget failed with exit status ${wget_status}, see ${website_host}.log" >&2
            exit "${wget_status}"
            ;;
    esac

    # Back up the directory of the mirrored website if backup is enabled.
    # The variable backup_dir (set by the -b option) shares its name with the
    # backup_dir function; ":-" keeps the test from aborting under -u when -b
    # was not passed and the variable is therefore unset.
    if [ -n "${backup_dir:-}" ] ; then
        backup_dir "${website_host}"
    fi