# robots.txt for catalogue.slwa.wa.gov.au aka henrietta.slwa.wa.gov.au also henrietta.liswa.wa.gov.au
# combination of the stock iii robots.txt and the robots.txt of wikipedia http://www.wikipedia.org/

# the first part of this file picks on certain bots

# to slow down the MSNBOT
# apparently if msnbot and bingbot find a rule specific to itself it will ignore all other rules:
# User-agent: msnbot
# Crawl-delay: 5
# bingbot is not playing by the rules 23 March 2012 so
# 1 April Bingbot has been playing by the rules this week, so rewarding it
# 24 September 2012 Bingbot is bad see above so access blocked until further notice
User-agent: msnbot
Disallow: /

User-agent: bingbot
Disallow: /

# For the WebBridge Google Scholar Extension. Allows googlebot_IA to crawl /screens
# temporarily blocking Googlebot and webbridge after problems 18 to 20 May 2012
# Remove as III recommended robots.txt excludes them May 2012
# User-agent: Googlebot-IA
#
# Allow: /screens
# Allow: /webbridge
# Allow: /webbridge~S6

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Crawlers that are kind enough to obey, but which we'd rather not have unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy entire sites.
# Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: Zealbot
Disallow: /

User-agent: MSIECrawler
Disallow: /

User-agent: SiteSnagger
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: Fetch
Disallow: /

User-agent: Offline Explorer
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: linko
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Microsoft.URL.Control
Disallow: /

User-agent: Xenu
Disallow: /

User-agent: larbin
Disallow: /

User-agent: libwww
Disallow: /

User-agent: ZyBORG
Disallow: /

User-agent: Download Ninja
Disallow: /

# Sorry, wget in its recursive mode is a frequent problem.
User-agent: wget
Disallow: /

# The 'grub' distributed client has been *very* poorly behaved.
User-agent: grub-client
Disallow: /

# Doesn't follow robots.txt anyway, but...
User-agent: k2spider
Disallow: /

# Hits many times per second, not acceptable
# http://www.nameprotect.com/botinfo.html
User-agent: NPBot
Disallow: /

# A capture bot, downloads gazillions of pages with no public benefit
# http://www.webreaper.net/
User-agent: WebReaper
Disallow: /

# The second part of the file instructs all WWW robots NOT to index pages that begin
# with the URLS listed.
User-agent: *
Disallow: /acquire
Disallow: /airpac
Disallow: /airwkst
Disallow: /articles
Disallow: /availlim
Disallow: /bookill
Disallow: /bookit
Disallow: /circhistlim
Disallow: /circpix
Disallow: /cisti_order
Disallow: /clearhist
Disallow: /documents
Disallow: /donate
Disallow: /extlang
Disallow: /feeds
Disallow: /ftlist
Disallow: /goto
Disallow: /iii
Disallow: /ill
Disallow: /illframe
Disallow: /indexsort
Disallow: /journill
Disallow: /kids
Disallow: /launch
Disallow: /logout
Disallow: /manage
Disallow: /manual
Disallow: /metafind
Disallow: /mfgo
Disallow: /netli
Disallow: /nonret
Disallow: /programs
Disallow: /review
Disallow: /setlang
Disallow: /setscope
Disallow: /suggest
Disallow: /tmp
Disallow: /validate
Disallow: /VERIFYPATRON
Disallow: /VERSION
Disallow: /weblang
Disallow: /webbridge
Disallow: /webbridge~S2
Disallow: /webbridge~S6
Disallow: /wm
Disallow: /xrecord=
Disallow: /z39
Disallow: /z39m

# blocking crawling of search pages until further notice NC 10 Feb2012, update on 22 March after Bingbot frenzy
Disallow: /search
Disallow: /search~S1
Disallow: /search~S2
Disallow: /search~S2?

# temporarily blocking removed NC 6 March2012 reinstated by NC 21 September 2012 at CK request
# back for Mon to Fri until further notice 24 Sept 2012
# Disallow: /record
# Disallow: /record=
# Disallow: /screens
# Disallow: /patroninfo
# Disallow: /selfreg