Last active
August 7, 2020 09:14
-
-
Save nop/525d71594b5631c0338f0f2c84b5c26a to your computer and use it in GitHub Desktop.
Pull the complete works of William Shakespeare from shakespeare.mit.edu and convert to plain text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Pull the complete works of William Shakespeare from shakespeare.mit.edu,
# then convert them from HTML format to plain text with Pandoc.
#
# Output is written beneath the current working directory:
#   comedy/<work>.txt, history/<work>.txt, tragedy/<work>.txt
#   poetry/<poem>.txt
#   poetry/sonnets/<NNN> - <roman numeral>.txt   (NNN = 001..154)
#
# Requires bash (arrays and ${!name} indirect expansion are used, so plain
# POSIX sh is not enough), curl with --no-progress-meter support, and pandoc.
#
# Exit status: 0 when every document was fetched and converted, 1 otherwise.

set -euo pipefail

readonly BASE_URL="http://shakespeare.mit.edu"

# Work identifiers exactly as they appear in the site's URLs. The three
# play arrays are named after their category so that they can be looked up
# by name (indirect expansion) while iterating over `categories`.
readonly comedy=(
  "allswell" "asyoulikeit" "comedy_errors" "cymbeline" "lll" "measure"
  "merry_wives" "merchant" "midsummer" "much_ado" "pericles" "taming_shrew"
  "tempest" "troilus_cressida" "twelfth_night" "two_gentlemen" "winters_tale"
)
readonly history=(
  "1henryiv" "2henryiv" "henryv" "1henryvi" "2henryvi" "3henryvi"
  "henryviii" "john" "richardii" "richardiii"
)
readonly tragedy=(
  "cleopatra" "coriolanus" "hamlet" "julius_caesar" "lear" "macbeth"
  "othello" "romeo_juliet" "timon" "titus"
)
readonly poetry=("LoversComplaint" "RapeOfLucrece" "VenusAndAdonis" "elegy")

# Sonnet pages are addressed by Roman numeral; array index + 1 is the
# sonnet's ordinal number.
readonly sonnets=(
  "I" "II" "III" "IV" "V" "VI" "VII" "VIII" "IX" "X"
  "XI" "XII" "XIII" "XIV" "XV" "XVI" "XVII" "XVIII" "XIX" "XX"
  "XXI" "XXII" "XXIII" "XXIV" "XXV" "XXVI" "XXVII" "XXVIII" "XXIX" "XXX"
  "XXXI" "XXXII" "XXXIII" "XXXIV" "XXXV" "XXXVI" "XXXVII" "XXXVIII" "XXXIX" "XL"
  "XLI" "XLII" "XLIII" "XLIV" "XLV" "XLVI" "XLVII" "XLVIII" "XLIX" "L"
  "LI" "LII" "LIII" "LIV" "LV" "LVI" "LVII" "LVIII" "LIX" "LX"
  "LXI" "LXII" "LXIII" "LXIV" "LXV" "LXVI" "LXVII" "LXVIII" "LXIX" "LXX"
  "LXXI" "LXXII" "LXXIII" "LXXIV" "LXXV" "LXXVI" "LXXVII" "LXXVIII" "LXXIX" "LXXX"
  "LXXXI" "LXXXII" "LXXXIII" "LXXXIV" "LXXXV" "LXXXVI" "LXXXVII" "LXXXVIII" "LXXXIX" "XC"
  "XCI" "XCII" "XCIII" "XCIV" "XCV" "XCVI" "XCVII" "XCVIII" "XCIX" "C"
  "CI" "CII" "CIII" "CIV" "CV" "CVI" "CVII" "CVIII" "CIX" "CX"
  "CXI" "CXII" "CXIII" "CXIV" "CXV" "CXVI" "CXVII" "CXVIII" "CXIX" "CXX"
  "CXXI" "CXXII" "CXXIII" "CXXIV" "CXXV" "CXXVI" "CXXVII" "CXXVIII" "CXXIX" "CXXX"
  "CXXXI" "CXXXII" "CXXXIII" "CXXXIV" "CXXXV" "CXXXVI" "CXXXVII" "CXXXVIII" "CXXXIX" "CXL"
  "CXLI" "CXLII" "CXLIII" "CXLIV" "CXLV" "CXLVI" "CXLVII" "CXLVIII" "CXLIX" "CL"
  "CLI" "CLII" "CLIII" "CLIV"
)

# Play categories. Each entry is both an output directory name and the name
# of the array above that lists the category's works.
readonly categories=("comedy" "history" "tragedy")

# Number of documents that could not be fetched or converted.
failures=0

#######################################
# Download one HTML page and store it as plain text.
# A failed download or conversion is reported on stderr and counted, and the
# incomplete output file is removed; the function itself still returns 0 so
# the remaining documents are attempted.
# Globals:   failures (incremented on failure)
# Arguments: $1 - URL of the HTML page
#            $2 - path of the text file to write
# Outputs:   warning on stderr when the fetch or conversion fails
#######################################
fetch_as_text() {
  local url=$1
  local dest=$2

  # --fail makes curl exit non-zero on HTTP errors instead of handing the
  # server's error page to pandoc; pipefail (set above) propagates that
  # status through the pipeline.
  if ! curl -L --fail --no-progress-meter "${url}" |
    pandoc -f html -t plain -- > "${dest}"; then
    rm -f -- "${dest}"
    printf 'warning: could not fetch or convert %s\n' "${url}" >&2
    failures=$((failures + 1))
  fi
}

#######################################
# Fetch every play, poem and sonnet into the current directory.
# Globals:   categories, comedy, history, tragedy, poetry, sonnets,
#            BASE_URL (read); failures (read, written via fetch_as_text)
# Outputs:   one progress line per document on stdout
# Returns:   0 if everything was fetched, 1 if any document failed
#######################################
main() {
  local category works work poem number
  local -i index

  for category in "${categories[@]}"; do
    mkdir -p -- "${category}"
    # Indirect expansion: "${!works}" expands to the elements of the array
    # whose name is stored in ${category}.
    works="${category}[@]"
    for work in "${!works}"; do
      printf '%s: %s\n' "${category}" "${work}"
      fetch_as_text "${BASE_URL}/${work}/full.html" "${category}/${work}.txt"
    done
  done

  mkdir -p -- "poetry/sonnets"

  for poem in "${poetry[@]}"; do
    printf 'poetry: %s\n' "${poem}"
    fetch_as_text "${BASE_URL}/Poetry/${poem}.html" "poetry/${poem}.txt"
  done

  for index in "${!sonnets[@]}"; do
    # Sonnets are numbered from 1; zero-pad so the files sort in order and
    # the file name matches the number shown in the progress line.
    printf -v number '%03d' "$((index + 1))"
    printf 'sonnets: %s - %s\n' "${number}" "${sonnets[index]}"
    fetch_as_text "${BASE_URL}/Poetry/sonnet.${sonnets[index]}.html" \
      "poetry/sonnets/${number} - ${sonnets[index]}.txt"
  done

  if ((failures > 0)); then
    printf '%d document(s) could not be fetched\n' "${failures}" >&2
    return 1
  fi
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment