###################################
#
# Robots.txt
#
###################################

# Location of the XML sitemap. This directive is independent of the
# User-agent groups below and applies to every crawler that supports it.
Sitemap: http://www.apnauttarakhand.com/sitemap.xml

# Google - Most important bot
# Unfortunately a robots.txt will only stop it crawling certain URLs; it will
# NOT stop it adding URLs which it comes across into its index.
# So we're relying on a meta noindex tag for that.
User-agent: Googlebot
Allow: /sitemap.xml
Disallow: /wp-
Disallow: /feed/
Disallow: /trackback/
Disallow: /rss/
Disallow: /comments/feed/
Disallow: /page/
Disallow: /date/
Disallow: /comments/
Disallow: /cgi-bin/
Disallow: /200*
Disallow: /*?*
Disallow: /iframes/
Disallow: /recommends/

# Yahoo - Too aggressive
# So limit it as much as possible.
User-agent: Slurp
# Disallow everything
Disallow: /
# Now allow bits and then disallow bits
Allow: /sitemap.xml
Allow: /index.php?*;*
# Anything with a ; disallow
# NOTE(review): the Disallow below uses exactly the same pattern as the Allow
# above, so the two rules conflict. Parsers that follow RFC 9309 prefer the
# Allow when both rules match equally, which contradicts the stated intent.
# Confirm which pattern was meant to be allowed.
Disallow: /index.php?*;*

# Bad bot - Often ignores robots.txt - Waste of bandwidth
# Despite claiming on their website to be a search engine in development,
# I'm suspicious as to whether they are a harvester pretending to be a search engine.
User-agent: Twiceler
Disallow: /

User-agent: W3C-checklink
Disallow: /

User-agent: TurnitinBot
Disallow: /

# Catch all (remainder)
# Will be followed by any bots other than the ones identified above.
# Uses BASIC robots.txt prefix rules wherever possible so that spiders
# without wildcard support (including MSNBOT) still understand them.
User-agent: *
Disallow: /wp-
Disallow: /feed/
Disallow: /trackback/
Disallow: /rss/
Disallow: /comments/feed/
Disallow: /page/
Disallow: /date/
Disallow: /comments/
Disallow: /cgi-bin/
# Plain prefix: matches the same URLs as "/200*" for wildcard-aware bots and
# also works for bots that only do prefix matching.
Disallow: /200
# Query-string URLs cannot be expressed as a plain prefix; this rule only
# takes effect for bots that support the "*" wildcard.
Disallow: /*?*
Disallow: /iframes/
Disallow: /recommends/