Created: March 27, 2012 00:57
Revisions
gplv2 revised this gist on Mar 28, 2012: 1 changed file with 49 additions and 16 deletions.
Changed portions of the file (unchanged lines between the diff hunks are not shown):

@@ -1,13 +1,34 @@
# To relieve servers
## Imagine a robots.txt file like this (Google understands this format):
#User-agent: *
#Disallow: /detailed
#Disallow: /?action=detailed
#Disallow: /*/detailed
#Crawl-delay: 20
##
# To enable these rules, save them to httpd.conf (Debian/Ubuntu) and include the following 2 lines
# in each VirtualHost directive (see the sketch after this listing):
# RewriteEngine On
# RewriteOptions Inherit
# Then this will work in your virtual hosts as well as in the main server, except for those you don't set up.
# And if you want to enforce those policies, you can do this:
# Put the rules below in your httpd.conf file.
RewriteEngine On

# Serve a general robots.txt file (for all virtual hosts here) from a file Apache can access
RewriteRule ^/robots.txt$ /etc/apache2/robots.txt [L]
# The robots.txt file is now the only file that the bots blocked below are allowed to download.

# Block a fake Googlebot when the request does not come from Google's IP ranges. [F] => Forbidden
RewriteCond %{HTTP:X-FORWARDED-FOR} !^66\.249\.(6[4-9]|[78][0-9]|9[0-5])\.
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Googlebot/2\.[01];\ \+http://www\.google\.com/bot\.html\)$ [NC]
RewriteRule .* - [F,L]
# End if match

# IF THE UA STARTS WITH THESE
RewriteCond %{HTTP_USER_AGENT} ^(aesop_com_spiderman|alexibot|backweb|bandit|batchftp|bigfoot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(black.?hole|blackwidow|blowfish|botalot|buddy|builtbottough|bullseye) [NC,OR]

@@ -33,25 +54,38 @@
RewriteCond %{HTTP_USER_AGENT} ^(telesoft|the.?intraformant|thenomad|tighttwatbot|titan|urldispatcher) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(turingos|turnitinbot|urly.?warning|vacuum|vci|voideye|whacker) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(libwww-perl|widow|wisenutbot|wwwoffle|xaldon|xenu|zeus|zyborg|anonymouse) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(Sogou\ web\ spider) [NC]
# ISSUE 403 / SERVE ERRORDOCUMENT
RewriteRule . - [F,L]
# End if match

# Block real engines that do not respect robots.txt, while allowing correct calls to pass (all detail searches, basically).
# It seems to take about 2 days of 403s before they respect the robots.txt file, even though this one got downloaded several times.
# Google
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Googlebot/2\.[01];\ \+http://www\.google\.com/bot\.html\)$ [NC,OR]
# Bing
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ bingbot/2\.[01];\ \+http://www\.bing\.com/bingbot\.htm\)$ [NC,OR]
# msnbot
RewriteCond %{HTTP_USER_AGENT} ^msnbot-media/1\.[01]\ \(\+http://search\.msn\.com/msnbot\.htm\)$ [NC,OR]
# Slurp
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Yahoo!\ Slurp;\ http://help\.yahoo\.com/help/us/ysearch/slurp\)$ [NC]
# Block all detail searches, the rest may pass (things like /detailed, /EN/detailed and ?action=detailed)
RewriteCond %{REQUEST_URI} ^(/detailed|/[A-Z]{2}/detailed/) [OR]
# or with the action=detailed key set
RewriteCond %{QUERY_STRING} action=detailed
# ISSUE 403 / SERVE ERRORDOCUMENT
RewriteRule .* - [F,L]
# End if match

# Definite blocks
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ MJ12bot/v1\.4\.2;\ http://www\.majestic12\.co\.uk/bot\.php\?\+\)$ [NC,OR]
# Baidu
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Baiduspider/2\.[01];\ \+http://www\.baidu\.com/search/spider\.html\)$ [NC,OR]
# Deepspider
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ OpenindexDeepSpider/Nutch-1\.[0-9]-dev;\ \+http://www\.openindex\.io/en/webmasters/spider\.html\)$ [NC,OR]
# Known user agent strings definitely belonging to bad spiders
RewriteCond %{HTTP_USER_AGENT} ^web(zip|emaile|enhancer|fetch|go.?is|auto|bandit|clip|copier|master|reaper|sauger|site.?quester|whack) [NC,OR]
# Yandex (Russian Google)

@@ -63,14 +97,12 @@
# AhrefsBot
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ AhrefsBot/2\.[01];\ \+http://ahrefs\.com/robot/\)$ [NC,OR]
# Block a rogue Facebook application?
#RewriteCond %{HTTP_USER_AGENT} ^facebookexternalhit/1\.1\ \(\+http://www\.facebook\.com/externalhit_uatext.php\)$ [NC,OR]
#
# Vagabondo
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/4\.[01]\ \(compatible;\ \ Vagabondo/4\.0;\ webcrawler\ at\ wise-guys\ dot\ nl;\ http://webagent\.wise-guys\.nl/;\ http://www\.wise-guys\.nl/\) [NC,OR]
# Ezooms
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Ezooms/1\.[01];\ ezooms\.bot@gmail\.com\)$ [NC,OR]

@@ -81,4 +113,5 @@
RewriteCond %{HTTP_USER_AGENT} ^.*(craftbot|download|extract|stripper|sucker|ninja|clshttp|webspider|leacher|collector|grabber|webpictures).*$ [NC]
# ISSUE 403 / SERVE ERRORDOCUMENT
RewriteRule . - [F,L]
# End if match
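For reference, the per-site enablement described in the comments might look like the sketch below. The robots.txt body simply repeats the example from the top of the file; the hostname www.example.com and docroot /var/www/example are placeholders, not part of the gist.

Contents of /etc/apache2/robots.txt (the file the RewriteRule above hands out to bots):

User-agent: *
Disallow: /detailed
Disallow: /?action=detailed
Disallow: /*/detailed
Crawl-delay: 20

A VirtualHost that inherits the rules from the main httpd.conf:

<VirtualHost *:80>
    # Placeholder hostname and document root
    ServerName www.example.com
    DocumentRoot /var/www/example

    # Pick up the bot-blocking rules defined in httpd.conf
    RewriteEngine On
    RewriteOptions Inherit
</VirtualHost>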
gplv2 created this gist on Mar 27, 2012.
The full original file as created:

# To relieve the server
RewriteEngine On
# A robots.txt file is the only file that those bots are now allowed to download, for all virtual servers.
RewriteRule ^/robots.txt$ /etc/apache2/robots.txt [L]
# To enable these rules, save them to httpd.conf (Debian/Ubuntu) and include the following 2 lines in each VirtualHost directive:
# RewriteEngine On
# RewriteOptions Inherit

# IF THE UA STARTS WITH THESE
RewriteCond %{HTTP_USER_AGENT} ^(aesop_com_spiderman|alexibot|backweb|bandit|batchftp|bigfoot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(black.?hole|blackwidow|blowfish|botalot|buddy|builtbottough|bullseye) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(cheesebot|cherrypicker|chinaclaw|collector|copier|copyrightcheck) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(cosmos|crescent|curl|custo|da|diibot|disco|dittospyder|dragonfly) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(drip|easydl|ebingbong|ecatch|eirgrabber|emailcollector|emailsiphon) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(emailwolf|erocrawler|exabot|eyenetie|filehound|flashget|flunky) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(frontpage|getright|getweb|go.?zilla|go-ahead-got-it|gotit|grabnet) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(grafula|harvest|hloader|hmview|httplib|httrack|humanlinks|ilsebot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(infonavirobot|infotekies|intelliseek|interget|iria|jennybot|jetcar) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(joc|justview|jyxobot|kenjin|keyword|larbin|leechftp|lexibot|lftp|libweb) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(likse|linkscan|linkwalker|lnspiderguy|lwp|magnet|mag-net|markwatch) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(mata.?hari|memo|microsoft.?url|midown.?tool|miixpc|mirror|missigua) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(mister.?pix|moget|mozilla.?newt|nameprotect|navroad|backdoorbot|nearsite) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(net.?vampire|netants|netcraft|netmechanic|netspider|nextgensearchbot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(attach|nicerspro|nimblecrawler|npbot|octopus|offline.?explorer) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(offline.?navigator|openfind|outfoxbot|pagegrabber|papa|pavuk) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(pcbrowser|php.?version.?tracker|pockey|propowerbot|prowebwalker) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(psbot|pump|queryn|recorder|realdownload|reaper|reget|true_robot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(repomonkey|rma|internetseer|sitesnagger|siphon|slysearch|smartdownload) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(snake|snapbot|snoopy|sogou|spacebison|spankbot|spanner|sqworm|superbot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(superhttp|surfbot|asterias|suzuran|szukacz|takeout|teleport) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(telesoft|the.?intraformant|thenomad|tighttwatbot|titan|urldispatcher) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(turingos|turnitinbot|urly.?warning|vacuum|vci|voideye|whacker) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(libwww-perl|widow|wisenutbot|wwwoffle|xaldon|xenu|zeus|zyborg|anonymouse) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(Sogou\ web\ spider) [NC,OR]
# Block google
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Googlebot/2\.[01];\ \+http://www\.google\.com/bot\.html\)$ [NC,OR]
# Block bing
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ bingbot/2\.[01];\ \+http://www\.bing\.com/bingbot\.htm\)$ [NC,OR]
# Block msnbot
RewriteCond %{HTTP_USER_AGENT} ^msnbot-media/1\.[01]\ \(\+http://search\.msn\.com/msnbot\.htm\)$ [NC,OR]
# Some idiot bot
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ MJ12bot/v1\.4\.2;\ http://www\.majestic12\.co\.uk/bot\.php\?\+\)$ [NC,OR]
# Baidu
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Baiduspider/2\.[01];\ \+http://www\.baidu\.com/search/spider\.html\)$ [NC,OR]
# Deepspider
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ OpenindexDeepSpider/Nutch-1\.[0-9]-dev;\ \+http://www\.openindex\.io/en/webmasters/spider\.html\)$ [NC,OR]
# STARTS WITH WEB
RewriteCond %{HTTP_USER_AGENT} ^web(zip|emaile|enhancer|fetch|go.?is|auto|bandit|clip|copier|master|reaper|sauger|site.?quester|whack) [NC,OR]
# Yandex (Russian Google)
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ YandexBot/3\.[01];\ \+http://yandex\.com/bots\)$ [NC,OR]
# Pingdom
RewriteCond %{HTTP_USER_AGENT} ^Pingdom\.com_bot_version_1\.4_\(http://www\.pingdom\.com/\) [NC,OR]
# AhrefsBot
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ AhrefsBot/2\.[01];\ \+http://ahrefs\.com/robot/\)$ [NC,OR]
# Some rogue Facebook faker?
#RewriteCond %{HTTP_USER_AGENT} ^facebookexternalhit/1\.1\ \(\+http://www\.facebook\.com/externalhit_uatext.php\)$ [NC,OR]
# Vagabondo
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/4\.[01]\ \(compatible;\ \ Vagabondo/4\.0;\ webcrawler\ at\ wise-guys\ dot\ nl;\ http://webagent\.wise-guys\.nl/;\ http://www\.wise-guys\.nl/\) [NC,OR]
# Slurp
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Yahoo!\ Slurp;\ http://help\.yahoo\.com/help/us/ysearch/slurp\)$ [NC,OR]
# Ezooms
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Ezooms/1\.[01];\ ezooms\.bot@gmail\.com\)$ [NC,OR]
# Monalisa
RewriteCond %{HTTP_USER_AGENT} ^Synthesio\ Crawler\ release\ MonaLisa\ \(contact\ at\ synthesio\ dot\ fr\)$ [NC,OR]
# ANYWHERE IN UA -- GREEDY REGEX
RewriteCond %{HTTP_USER_AGENT} ^.*(craftbot|download|extract|stripper|sucker|ninja|clshttp|webspider|leacher|collector|grabber|webpictures).*$ [NC]
# ISSUE 403 / SERVE ERRORDOCUMENT
RewriteRule . - [F,L]
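A note on the "ISSUE 403 / SERVE ERRORDOCUMENT" comments: the [F] flag makes Apache answer matching requests with 403 Forbidden and serve whatever ErrorDocument is configured for that status. Since the point of these rules is to relieve the server, it can help to point that handler at a small static page so blocked bots never reach any application code. A minimal sketch, assuming a hypothetical static file at /errors/403.html under the docroot (not part of the gist):

# Hypothetical: serve a tiny static page for the 403s issued by the rules above
ErrorDocument 403 /errors/403.html

Without this, Apache falls back to its built-in 403 response, which is also small, so the directive is optional.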