From dc62246443e3584ef5267505275f618f6fa86bf7 Mon Sep 17 00:00:00 2001
From: Evan Prodromou <evan@status.net>
Date: Sun, 31 Jan 2010 10:12:26 -0500
Subject: [PATCH] Add a robots.txt URL to the site root

Adds a robots.txt file to the site root. Defaults are defined by the
'robotstxt' section of the config. New events StartRobotsTxt and
EndRobotsTxt let plugins add information. Probably not useful if the
site path is not /, but it won't hurt anything, either.
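
For illustration, with the default 'robotstxt' settings on a public
site, GET /robots.txt should produce roughly:

    User-Agent: *
    Disallow: /main/
    Disallow: /settings/
    Disallow: /admin/
    Disallow: /search/
    Disallow: /message/

(No Crawl-delay line appears because the default crawldelay is zero;
on a private site the output is a single "Disallow: /".)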
---
 EVENTS.txt            |    6 +++
 README                |   14 ++++++
 actions/robotstxt.php |  100 ++++++++++++++++++++++++++++++++++++++++++
 index.php             |    5 ++-
 lib/default.php       |    4 ++
 lib/router.php        |    2 +
 6 files changed, 129 insertions(+), 2 deletions(-)
 create mode 100644 actions/robotstxt.php

diff --git a/EVENTS.txt b/EVENTS.txt
index 3317c80dec..6bf12bf13f 100644
--- a/EVENTS.txt
+++ b/EVENTS.txt
@@ -708,3 +708,9 @@ EndUserRegister: When a new user has been registered
 - &$profile: new profile data
 - &$user: new user account
 
+StartRobotsTxt: Before outputting the robots.txt page
+- &$action: RobotstxtAction being shown
+
+EndRobotsTxt: After the default robots.txt page (a good place for customization)
+- &$action: RobotstxtAction being shown
+
diff --git a/README b/README
index da278f7412..4e576dcdd3 100644
--- a/README
+++ b/README
@@ -1496,6 +1496,20 @@ interface. It also makes the user's profile the root URL.
 enabled: Whether to run in "single user mode". Default false.
 nickname: nickname of the single user.
 
+robotstxt
+---------
+
+We serve a default robots.txt file to guide the processing of
+Web crawlers. See http://www.robotstxt.org/ for more information
+on the format of this file.
+
+crawldelay: if non-empty, this value is provided as the Crawl-Delay:
+    for the robots.txt file. See http://ur1.ca/l5a0
+    for more information. Default is zero, meaning no explicit delay.
+disallow: Array of (virtual) directories to disallow. Default is 'main',
+    'search', 'message', 'settings', 'admin'. Ignored when the site
+    is private, in which case the entire site ('/') is disallowed.
+
 Plugins
 =======
 
diff --git a/actions/robotstxt.php b/actions/robotstxt.php
new file mode 100644
index 0000000000..5131097c8c
--- /dev/null
+++ b/actions/robotstxt.php
@@ -0,0 +1,100 @@
+<?php
+/**
+ * StatusNet, the distributed open-source microblogging tool
+ *
+ * Print out a robots.txt file to guide the processing of
+ * Web crawlers
+ *
+ * PHP version 5
+ *
+ * @category Action
+ * @package  StatusNet
+ * @author   Evan Prodromou <evan@status.net>
+ * @license  http://www.fsf.org/licensing/licenses/agpl.html AGPLv3
+ * @link     http://status.net/
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+if (!defined('STATUSNET')) {
+    exit(1);
+}
+
+/**
+ * Prints out a static robots.txt
+ *
+ * @category Action
+ * @package  StatusNet
+ * @author   Evan Prodromou <evan@status.net>
+ * @license  http://www.fsf.org/licensing/licenses/agpl.html AGPLv3
+ * @link     http://status.net/
+ */
+
+class RobotstxtAction extends Action
+{
+    /**
+     * Handles requests
+     *
+     * Since this is a relatively static document, we
+     * don't do a prepare()
+     *
+     * @param array $args GET, POST, and URL params; unused.
+     *
+     * @return void
+     */
+
+    function handle($args)
+    {
+        if (Event::handle('StartRobotsTxt', array($this))) {
+
+            header('Content-Type: text/plain');
+
+            print "User-Agent: *\n";
+
+            if (common_config('site', 'private')) {
+
+                print "Disallow: /\n";
+
+            } else {
+
+                $disallow = common_config('robotstxt', 'disallow');
+
+                foreach ($disallow as $dir) {
+                    print "Disallow: /$dir/\n";
+                }
+
+                $crawldelay = common_config('robotstxt', 'crawldelay');
+
+                if (!empty($crawldelay)) {
+                    print "Crawl-delay: " . $crawldelay . "\n";
+                }
+            }
+
+            Event::handle('EndRobotsTxt', array($this));
+        }
+    }
+
+    /**
+     * Return true; this page doesn't touch the DB.
+     *
+     * @param array $args other arguments
+     *
+     * @return boolean is read only action?
+     */
+
+    function isReadOnly($args)
+    {
+        return true;
+    }
+}
diff --git a/index.php b/index.php
index 605b380bf8..06ff9900fd 100644
--- a/index.php
+++ b/index.php
@@ -285,8 +285,9 @@ function main()
     if (!$user && common_config('site', 'private')
         && !isLoginAction($action)
         && !preg_match('/rss$/', $action)
-        && !preg_match('/^Api/', $action)
-    ) {
+        && $action != 'robotstxt'
+        && !preg_match('/^Api/', $action)) {
+
         // set returnto
         $rargs =& common_copy_args($args);
         unset($rargs['action']);
diff --git a/lib/default.php b/lib/default.php
index 1337a96336..2bedc4bf08 100644
--- a/lib/default.php
+++ b/lib/default.php
@@ -270,4 +270,8 @@ $default =
         'singleuser' =>
         array('enabled' => false,
               'nickname' => null),
+        'robotstxt' =>
+        array('crawldelay' => 0,
+              'disallow' => array('main', 'settings', 'admin', 'search', 'message')
+              ),
         );
diff --git a/lib/router.php b/lib/router.php
index ca9f328126..4b5b8d0bb8 100644
--- a/lib/router.php
+++ b/lib/router.php
@@ -73,6 +73,8 @@ class Router
 
         if (Event::handle('StartInitializeRouter', array(&$m))) {
 
+            $m->connect('robots.txt', array('action' => 'robotstxt'));
+
             $m->connect('opensearch/people', array('action' => 'opensearch',
                                                    'type' => 'people'));
             $m->connect('opensearch/notice', array('action' => 'opensearch',
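
A usage sketch (not part of this commit): a plugin can hook the new
EndRobotsTxt event to extend the output. The plugin name and sitemap
path below are hypothetical; the sketch assumes the stock StatusNet
Plugin base class, which registers on<EventName> methods as event
handlers, and the existing common_root_url() helper.

    <?php
    // Hypothetical plugin: appends a Sitemap line to robots.txt
    // after the default rules have been printed.
    class RobotsSitemapPlugin extends Plugin
    {
        function onEndRobotsTxt($action)
        {
            // Assumed sitemap location; adjust for your site.
            print "Sitemap: " . common_root_url() . "sitemap.xml\n";
            return true; // let other EndRobotsTxt handlers run
        }
    }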