robots.php 2.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. <?php if (!defined('PmWiki')) exit(); // stop at once unless the 'PmWiki' constant is defined (presumably set by pmwiki.php, so the file cannot be run directly)
  2. /* Copyright 2005-2006 Patrick R. Michaud (pmichaud@pobox.com)
  3. This file is part of PmWiki; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published
  5. by the Free Software Foundation; either version 2 of the License, or
  6. (at your option) any later version. See pmwiki.php for full details.
  7. This file provides various features to allow PmWiki to control
  8. what web crawlers (robots) see when they visit the site. Of course
  9. it's still possible to control robots at the webserver level
  10. and via robots.txt, but this file provides a finer level
  11. of control.
  12. The $MetaRobots variable controls generation of the
  13. <meta name='robots' ... /> tag in the head of the HTML document.
  14. By default $MetaRobots is set so that robots index only the 'browse'
  15. action, and not pages in the Site and PmWiki groups (except PmWiki.PmWiki).
  16. The $RobotPattern variable is used to determine if the user agent
  17. accessing the site is a robot, and $IsRobotAgent is set accordingly.
  18. By default this pattern identifies Googlebot, Yahoo! Slurp, msnbot,
  19. Teoma, ia_archiver, BecomeBot, HTTrack, and MJ12bot as robots.
  20. If the agent is deemed a robot, then the $RobotActions array is
  21. checked to see if robots are allowed to perform the given action,
  22. and if not the robot is immediately sent an HTTP 403 Forbidden
  23. response.
  24. If $EnableRobotCloakActions is set, then a pattern is added to
  25. $FmtP to hide any "?action=" url parameters in page urls
  26. generated by PmWiki for actions that robots aren't allowed to
  27. access. This can greatly reduce the load on the server by
  28. not providing the robot with links to pages that it will be
  29. forbidden to index anyway.
  30. */
  ## Default value for the <meta name='robots' ...> tag.  Anything other
  ## than a plain 'browse' request, and any page in the Site group or in
  ## the PmWiki group (PmWiki.PmWiki itself excepted), is marked
  ## noindex,nofollow; everything else is index,follow.
  ## SDV() is PmWiki's set-default helper (presumably a no-op when the
  ## variable has already been configured -- confirm in pmwiki.php).
  if ($action != 'browse'
      || preg_match('#^PmWiki[./](?!PmWiki$)|^Site[./]#', $pagename)) {
    SDV($MetaRobots, 'noindex,nofollow');
  } else {
    SDV($MetaRobots, 'index,follow');
  }

  ## Register the tag only when $MetaRobots is non-empty.  The escaped
  ## \$MetaRobots stays a literal placeholder in the format string
  ## (presumably expanded when the HTML header is rendered).
  if ($MetaRobots) {
    $HTMLHeaderFmt['robots'] = " <meta name='robots' content='\$MetaRobots' />\n";
  }
  ## $RobotPattern is a regular-expression alternation used to identify
  ## robots from their User-Agent string.
  SDV($RobotPattern,'Googlebot|Slurp|msnbot|Teoma|ia_archiver|BecomeBot|HTTrack|MJ12bot');

  ## $IsRobotAgent is true when the User-Agent header matches
  ## $RobotPattern.  A missing header is treated as an empty string so
  ## that preg_match() is never handed null (deprecated as of PHP 8.1);
  ## an empty $RobotPattern short-circuits and disables detection.
  SDV($IsRobotAgent,
    $RobotPattern && preg_match("!$RobotPattern!",
      isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : ''));

  ## Nothing further in this file applies to non-robot visitors.
  if (!$IsRobotAgent) return;
  ## $RobotActions maps action names to a flag; a truthy flag means a
  ## robot may perform that action.  A robot requesting any other action
  ## is answered with "403 Forbidden" and the request ends here.
  SDVA($RobotActions, array('browse' => 1, 'rss' => 1, 'dc' => 1));
  if (empty($RobotActions[$action])) {
    header("HTTP/1.1 403 Forbidden");
    echo "<h1>Forbidden</h1>";
    exit;
  }
  ## When $EnableRobotCloakActions is enabled, register an $FmtP
  ## replacement that strips "?action=..." from $ScriptUrl-based page
  ## urls unless the action is one that robots are allowed to access,
  ## so robots are not offered links that would only be refused.
  if (IsEnabled($EnableRobotCloakActions, 0)) {
    ## array_filter() without a callback drops every falsy entry, which
    ## is exactly what the former create_function() callback did;
    ## create_function() was deprecated in PHP 7.2 and removed in 8.0.
    $p = join('|', array_keys(array_filter($RobotActions)));
    ## The \b after the alternation keeps an allowed name from shielding
    ## a longer, forbidden action that merely begins with it
    ## (e.g. allowed "dc" versus "dcfoo").
    $FmtP["/(\\\$ScriptUrl[^#\"'\\s<>]+)\\?action=(?!(?:$p)\\b)\\w+/"] = '$1';
  }