Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save grmpfhmbl/d82d50ea54fcddbd936d to your computer and use it in GitHub Desktop.

Revisions

  1. grmpfhmbl revised this gist Jan 18, 2016. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions scrape-apache-directory-index.sh
    Original file line number Diff line number Diff line change
    @@ -44,8 +44,8 @@ fetch()
    done

    for DIR in $DIRS; do
    echo "fetch $1$DIR"
    fetch $1$DIR
    echo "fetch $1/$DIR"
    fetch $1/$DIR
    done

    rm -f $INDEX_FILE
  2. grmpfhmbl revised this gist Jan 18, 2016. 1 changed file with 10 additions and 5 deletions.
    15 changes: 10 additions & 5 deletions scrape-apache-directory-index.sh
    Original file line number Diff line number Diff line change
    @@ -16,22 +16,26 @@ fetch()
    INDEX_URI=$BASE_URI$1
    DIR=`basename $1`

    echo "INDEX_URI=$INDEX_URI"

    if [ ! -d $DIR ]; then
    mkdir $DIR
    fi

    pushd $DIR > /dev/null
    pushd $DIR

    echo "INFO: Downloading $INDEX_URI"
    echo "curl -o $INDEX_FILE -s -L $INDEX_URI"
    curl -o $INDEX_FILE -s -L $INDEX_URI

    if [ $? -eq 0 ]; then
    DIRS=`grep '\[DIR\]' $INDEX_FILE | grep -v 'Parent Directory' | sed -e 's/.*href="\([^"]*\).*/\1/g'`
    TXTS=`grep '\[TXT\]' $INDEX_FILE | sed -e 's/.*href="\([^"]*\).*/\1/g'`
    IMGS=`grep '\[IMG\]' $INDEX_FILE | sed -e 's/.*href="\([^"]*\).*/\1/g'`
    UNKNOWNS=`grep '\[ \]' $INDEX_FILE | sed -e 's/.*href="\([^"]*\).*/\1/g'`
    DIRS=`grep '\[DIR\]' $INDEX_FILE | grep -v 'Parent Directory' | sed -e 's/.*href="\([^"]*\).*/\1/g'`

    for FILE in $TXTS $UNKNOWNS; do
    FILE_URI=$BASE_URI$FILE
    for FILE in $TXTS $UNKNOWNS $IMGS; do
    FILE_URI=$INDEX_URI$FILE
    echo "INFO: Downloading $FILE_URI"
    curl -O -s -L -R $FILE_URI
    if [ $? -ne 0 ]; then
    @@ -40,7 +44,8 @@ fetch()
    done

    for DIR in $DIRS; do
    fetch $DIR
    echo "fetch $1$DIR"
    fetch $1$DIR
    done

    rm -f $INDEX_FILE
  3. @kaorimatz kaorimatz revised this gist Jun 17, 2013. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions scrape-apache-directory-index.sh
    Original file line number Diff line number Diff line change
    @@ -27,10 +27,10 @@ fetch()

    if [ $? -eq 0 ]; then
    TXTS=`grep '\[TXT\]' $INDEX_FILE | sed -e 's/.*href="\([^"]*\).*/\1/g'`
    UNKOWNS=`grep '\[ \]' $INDEX_FILE | sed -e 's/.*href="\([^"]*\).*/\1/g'`
    UNKNOWNS=`grep '\[ \]' $INDEX_FILE | sed -e 's/.*href="\([^"]*\).*/\1/g'`
    DIRS=`grep '\[DIR\]' $INDEX_FILE | grep -v 'Parent Directory' | sed -e 's/.*href="\([^"]*\).*/\1/g'`

    for FILE in $TXTS $UNKOWNS; do
    for FILE in $TXTS $UNKNOWNS; do
    FILE_URI=$BASE_URI$FILE
    echo "INFO: Downloading $FILE_URI"
    curl -O -s -L -R $FILE_URI
  4. @kaorimatz kaorimatz renamed this gist Jun 17, 2013. 1 changed file with 0 additions and 0 deletions.
  5. @kaorimatz kaorimatz created this gist Jun 17, 2013.
    54 changes: 54 additions & 0 deletions scrape-apache-directory-indexes.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,54 @@
    #!/bin/bash
    #
    # Recursively mirror files listed by an Apache mod_autoindex directory
    # index ("fancy index") page.
    #
    # Usage: scrape-apache-directory-index.sh <base_uri> <target_dir_path>
    #   base_uri        - server prefix, e.g. http://example.com
    #   target_dir_path - path under base_uri to start from, e.g. /pub/
    #
    # NOTE: 'set -e' is deliberately NOT used: a single failed download is
    # reported as a warning and the crawl continues.
    set -u

    if [ $# -ne 2 ]; then
        echo "usage: $0 <base_uri> <target_dir_path>" 1>&2
        exit 1
    fi

    BASE_URI=$1
    TARGET_DIR_PATH=$2
    readonly INDEX_FILE='index.html'

    #######################################
    # Download the index at $BASE_URI$1, fetch every file it links, and
    # recurse into subdirectories. Works inside a local directory named
    # after $1's basename (created if missing).
    # Arguments: $1 - directory path relative to BASE_URI
    # Outputs:   progress to stdout, warnings to stderr
    #######################################
    fetch()
    {
        local index_uri="$BASE_URI$1"
        local dir
        dir=$(basename "$1")

        mkdir -p "$dir"
        pushd "$dir" > /dev/null

        echo "INFO: Downloading $index_uri"
        if curl -o "$INDEX_FILE" -s -L "$index_uri"; then
            # Apache fancy indexes tag each row by icon alt text:
            # [DIR] directories, [TXT] text files, [IMG] images, [ ] other.
            local href_re='s/.*href="\([^"]*\).*/\1/g'
            local dirs txts imgs unknowns
            dirs=$(grep '\[DIR\]' "$INDEX_FILE" | grep -v 'Parent Directory' | sed -e "$href_re")
            txts=$(grep '\[TXT\]' "$INDEX_FILE" | sed -e "$href_re")
            imgs=$(grep '\[IMG\]' "$INDEX_FILE" | sed -e "$href_re")
            unknowns=$(grep '\[ \]' "$INDEX_FILE" | sed -e "$href_re")

            # Intentionally unquoted: the lists are whitespace-separated
            # hrefs (Apache percent-encodes spaces, so splitting is safe).
            local file file_uri
            for file in $txts $imgs $unknowns; do
                # hrefs are relative to the index we just fetched,
                # not to BASE_URI.
                file_uri="$index_uri$file"
                echo "INFO: Downloading $file_uri"
                if ! curl -O -s -L -R "$file_uri"; then
                    echo "WARN: Failed to download: $file_uri" 1>&2
                fi
            done

            local subdir
            for subdir in $dirs; do
                # Carry the full path so recursion below the first level
                # still builds the correct URI.
                fetch "$1/$subdir"
            done

            rm -f "$INDEX_FILE"
        else
            echo "WARN: Failed to download directory index: $index_uri" 1>&2
        fi

        popd > /dev/null
    }

    fetch "$TARGET_DIR_PATH"