From 56875318489b11c96d0360f7458bb3c6f1a25dc9 Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Fri, 31 Dec 2010 12:09:15 -0800 Subject: [PATCH 1/5] Bookmark plugin: graceful error out for failure to import a delicious bookmark due to it being already bookmarked --- .../Bookmark/deliciousbookmarkimporter.php | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/plugins/Bookmark/deliciousbookmarkimporter.php b/plugins/Bookmark/deliciousbookmarkimporter.php index 297ef81246..545c336860 100644 --- a/plugins/Bookmark/deliciousbookmarkimporter.php +++ b/plugins/Bookmark/deliciousbookmarkimporter.php @@ -96,13 +96,19 @@ class DeliciousBookmarkImporter extends QueueHandler $addDate = $a->getAttribute('add_date'); $created = common_sql_date(intval($addDate)); - $saved = Bookmark::saveNew($user->getProfile(), - $title, - $url, - $tags, - $description, - array('created' => $created, - 'distribute' => false)); + try { + $saved = Bookmark::saveNew($user->getProfile(), + $title, + $url, + $tags, + $description, + array('created' => $created, + 'distribute' => false)); + } catch (ClientException $e) { + // Most likely a duplicate -- continue on with the rest! + common_log(LOG_ERR, "Error importing delicious bookmark to $url: " . $e->getMessage()); + return true; + } return true; } From fedfde9bbb3c898328c6395a41c3243d6e97a2bf Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Fri, 31 Dec 2010 12:09:54 -0800 Subject: [PATCH 2/5] Bookmark plugin: fixes for bad DOM element nesting in delicious import data delicious bookmark exports use the godawful HTML bookmark file format that ancient versions of Netscape used (and has thus been the common import/export format for bookmarks since the dark ages of the web :) This arranges bookmark entries as an HTML definition list, using a lot of implied close tags (leaving off the and ). 
DOMDocument->loadHTML() uses libxml2's HTML mode, which generally does ok with muddling through things but apparently is really, really bad about handling those implied close tags. Sequences of adjacent <dt> elements (eg bookmark without a description, followed by another bookmark "<dt><dt>"), end up interpreted as nested ("<dt><dt></dt></dt>") instead of as siblings ("<dt></dt><dt></dt>"). The first round of code tried to resolve the nesting inline, but ended up a bit funky in places. I've replaced this with a standalone run through the data to re-order the elements, based on our knowing that <dt> and <dd>
cannot directly contain one another; once that's done, our main logic loop can be a bit cleaner. I'm not 100% sure it's doing nested sublists correctly, but these don't seem to show up in delicious export (and even if they do, with the way we flatten the input it shouldn't make a difference). Also fixed a clearer edge case where some bookmarks didn't get imported when missing descriptions. --- plugins/Bookmark/deliciousbackupimporter.php | 117 ++++++++++++++++--- 1 file changed, 101 insertions(+), 16 deletions(-) diff --git a/plugins/Bookmark/deliciousbackupimporter.php b/plugins/Bookmark/deliciousbackupimporter.php index 1b55115d6d..bc5a91be80 100644 --- a/plugins/Bookmark/deliciousbackupimporter.php +++ b/plugins/Bookmark/deliciousbackupimporter.php @@ -65,7 +65,7 @@ class DeliciousBackupImporter extends QueueHandler * and import to StatusNet as Bookmark activities. * * The document format is terrible. It consists of a
<dl> with - * a bunch of <dt>'s, occasionally with <dd>'s. + * a bunch of <dt>'s, occasionally with <dd>'s adding descriptions. * There are sometimes <p>

's lost inside. * * @param array $data pair of user, text @@ -99,6 +99,9 @@ class DeliciousBackupImporter extends QueueHandler } switch (strtolower($child->tagName)) { case 'dt': + //

<dt> nodes contain primary information about a bookmark. + // We can't import the current one just yet though, since + // it may be followed by a <dd>
. if (!empty($dt)) { // No DD provided $this->importBookmark($user, $dt); @@ -109,10 +112,13 @@ class DeliciousBackupImporter extends QueueHandler case 'dd': $dd = $child; + // This
<dd> contains a description for the bookmark in + // the preceding <dt>
node. $saved = $this->importBookmark($user, $dt, $dd); $dt = null; $dd = null; + break; case 'p': common_log(LOG_INFO, 'Skipping the

in the

.'); break; @@ -126,6 +132,14 @@ class DeliciousBackupImporter extends QueueHandler $dt = $dd = null; } } + if (!empty($dt)) { + // There was a final bookmark without a description. + try { + $this->importBookmark($user, $dt); + } catch (Exception $e) { + common_log(LOG_ERR, $e->getMessage()); + } + } return true; } @@ -148,21 +162,6 @@ class DeliciousBackupImporter extends QueueHandler function importBookmark($user, $dt, $dd = null) { - // We have to go squirrelling around in the child nodes - // on the off chance that we've received another
- // as a child. - - for ($i = 0; $i < $dt->childNodes->length; $i++) { - $child = $dt->childNodes->item($i); - if ($child->nodeType == XML_ELEMENT_NODE) { - if ($child->tagName == 'dt' && !is_null($dd)) { - $this->importBookmark($user, $dt); - $this->importBookmark($user, $child, $dd); - return; - } - } - } - $qm = QueueManager::get(); $qm->enqueue(array($user, $dt, $dd), 'dlcsbkmk'); @@ -188,9 +187,95 @@ class DeliciousBackupImporter extends QueueHandler error_reporting($old); if ($ok) { + foreach ($dom->getElementsByTagName('body') as $node) { + $this->fixListsIn($node); + } return $dom; } else { return null; } } + + + function fixListsIn(DOMNode $body) { + $toFix = array(); + + foreach ($body->childNodes as $node) { + if ($node->nodeType == XML_ELEMENT_NODE) { + $el = strtolower($node->nodeName); + if ($el == 'dl') { + $toFix[] = $node; + } + } + } + + foreach ($toFix as $node) { + $this->fixList($node); + } + } + + function fixList(DOMNode $list) { + $toFix = array(); + + foreach ($list->childNodes as $node) { + if ($node->nodeType == XML_ELEMENT_NODE) { + $el = strtolower($node->nodeName); + if ($el == 'dt' || $el == 'dd') { + $toFix[] = $node; + } + if ($el == 'dl') { + // Sublist. + // Technically, these can only appear inside a
... + $this->fixList($node); + } + } + } + + foreach ($toFix as $node) { + $this->fixListItem($node); + } + } + + function fixListItem(DOMNode $item) { + // The HTML parser in libxml2 doesn't seem to properly handle + // many cases of implied close tags, apparently because it doesn't + // understand the nesting rules specified in the HTML DTD. + // + // This leads to sequences of adjacent
<dt>s or <dd>s being incorrectly + // interpreted as parent->child trees instead of siblings: + // + // When parsing this input: "<dt>aaa<dt>bbb" + // should be equivalent to: "<dt>aaa</dt><dt>bbb</dt>" + // but we're seeing instead: "<dt>aaa<dt>bbb</dt></dt>
" + // + // It does at least know that going from dt to dd, or dd to dt, + // should make a break. + + $toMove = array(); + + foreach ($item->childNodes as $node) { + if ($node->nodeType == XML_ELEMENT_NODE) { + $el = strtolower($node->nodeName); + if ($el == 'dt' || $el == 'dd') { + // dt & dd cannot contain each other; + // This node was incorrectly placed; move it up a level! + $toMove[] = $node; + } + if ($el == 'dl') { + // Sublist. + // Technically, these can only appear inside a
. + $this->fixList($node); + } + } + } + + $parent = $item->parentNode; + $next = $item->nextSibling; + foreach ($toMove as $node) { + $item->removeChild($node); + $parent->insertBefore($node, $next); + $this->fixListItem($node); + } + } + } From 3368c33be776e41d603ab1c2149fe8b111877beb Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Fri, 31 Dec 2010 12:33:51 -0800 Subject: [PATCH 3/5] Bookmark plugin: fix for delicious import with queues enabled We were passing DOM nodes directly into the queues for the final bookmark import stage; unfortunately these don't actually survive serialization. Moved the extraction of properties from the HTML up to the first-stage handler, so now we don't have to worry about moving DOM nodes from one handler to the next. Instead passing an associative array of properties, which is fed into the Bookmark::saveNew by the per-bookmark handler. --- plugins/Bookmark/deliciousbackupimporter.php | 33 +++++++++++++- .../Bookmark/deliciousbookmarkimporter.php | 44 ++++--------------- 2 files changed, 40 insertions(+), 37 deletions(-) diff --git a/plugins/Bookmark/deliciousbackupimporter.php b/plugins/Bookmark/deliciousbackupimporter.php index bc5a91be80..197c7a143b 100644 --- a/plugins/Bookmark/deliciousbackupimporter.php +++ b/plugins/Bookmark/deliciousbackupimporter.php @@ -162,9 +162,38 @@ class DeliciousBackupImporter extends QueueHandler function importBookmark($user, $dt, $dd = null) { + $as = $dt->getElementsByTagName('a'); + + if ($as->length == 0) { + throw new ClientException(_("No tag in a
.")); + } + + $a = $as->item(0); + + $private = $a->getAttribute('private'); + + if ($private != 0) { + throw new ClientException(_('Skipping private bookmark.')); + } + + if (!empty($dd)) { + $description = $dd->nodeValue; + } else { + $description = null; + } + $addDate = $a->getAttribute('add_date'); + + $data = array( + 'profile_id' => $user->id, + 'title' => $a->nodeValue, + 'description' => $description, + 'url' => $a->getAttribute('href'), + 'tags' => $a->getAttribute('tags'), + 'created' => common_sql_date(intval($addDate)) + ); + $qm = QueueManager::get(); - - $qm->enqueue(array($user, $dt, $dd), 'dlcsbkmk'); + $qm->enqueue($data, 'dlcsbkmk'); } /** diff --git a/plugins/Bookmark/deliciousbookmarkimporter.php b/plugins/Bookmark/deliciousbookmarkimporter.php index 545c336860..018239f49d 100644 --- a/plugins/Bookmark/deliciousbookmarkimporter.php +++ b/plugins/Bookmark/deliciousbookmarkimporter.php @@ -61,52 +61,26 @@ class DeliciousBookmarkImporter extends QueueHandler /** * Handle the data * - * @param array $data array of user, dt, dd + * @param array $data associative array of user & bookmark info from DeliciousBackupImporter::importBookmark() * * @return boolean success value */ function handle($data) { - list($user, $dt, $dd) = $data; - - $as = $dt->getElementsByTagName('a'); - - if ($as->length == 0) { - throw new ClientException(_("No tag in a
.")); - } - - $a = $as->item(0); - - $private = $a->getAttribute('private'); - - if ($private != 0) { - throw new ClientException(_('Skipping private bookmark.')); - } - - if (!empty($dd)) { - $description = $dd->nodeValue; - } else { - $description = null; - } - - $title = $a->nodeValue; - $url = $a->getAttribute('href'); - $tags = $a->getAttribute('tags'); - $addDate = $a->getAttribute('add_date'); - $created = common_sql_date(intval($addDate)); + $profile = Profile::staticGet('id', $data['profile_id']); try { - $saved = Bookmark::saveNew($user->getProfile(), - $title, - $url, - $tags, - $description, - array('created' => $created, + $saved = Bookmark::saveNew($profile, + $data['title'], + $data['url'], + $data['tags'], + $data['description'], + array('created' => $data['created'], 'distribute' => false)); } catch (ClientException $e) { // Most likely a duplicate -- continue on with the rest! - common_log(LOG_ERR, "Error importing delicious bookmark to $url: " . $e->getMessage()); + common_log(LOG_ERR, "Error importing delicious bookmark to $data[url]: " . 
$e->getMessage()); return true; } From ae59046b1e5dd2c55e074e8e3d24e04ce8001b8e Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Fri, 31 Dec 2010 12:42:26 -0800 Subject: [PATCH 4/5] Bookmark plugin: tweak post-upload success message to distinguish between "already done" (UnQueueManager) and "started, should finish eventually" (other queue manager) --- plugins/Bookmark/importdelicious.php | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/plugins/Bookmark/importdelicious.php b/plugins/Bookmark/importdelicious.php index f8529cc914..b98b215717 100644 --- a/plugins/Bookmark/importdelicious.php +++ b/plugins/Bookmark/importdelicious.php @@ -48,6 +48,7 @@ if (!defined('STATUSNET')) { class ImportdeliciousAction extends Action { protected $success = false; + private $inprogress = false; /** * Return the title of the page @@ -191,7 +192,13 @@ class ImportdeliciousAction extends Action $qm = QueueManager::get(); $qm->enqueue(array(common_current_user(), $html), 'dlcsback'); - $this->success = true; + if ($qm instanceof UnQueueManager) { + // No active queuing means we've actually just completed the job! + $this->success = true; + } else { + // We've fed data into background queues, and it's probably still running. + $this->inprogress = true; + } $this->showPage(); @@ -212,8 +219,10 @@ class ImportdeliciousAction extends Action { if ($this->success) { $this->element('p', null, - _('Feed will be restored. '. - 'Please wait a few minutes for results.')); + _('Bookmarks have been imported. Your bookmarks should now appear in search and your profile page.')); + } else if ($this->inprogress) { + $this->element('p', null, + _('Bookmarks are being imported. 
Please wait a few minutes for results.')); } else { $form = new ImportDeliciousForm($this); $form->show(); From 98a0d7f538bad0b365adb1ec7eacb8c86c15384b Mon Sep 17 00:00:00 2001 From: Evan Prodromou Date: Mon, 3 Jan 2011 10:38:32 -0800 Subject: [PATCH 5/5] Configuration options for using an HTTP proxy We can make a lot of HTTP requests from the server side. This change adds some configuration options for using an HTTP proxy, which can cache hits from multiple sites (good for status.net-like services, for example). --- README | 16 ++++++++++++++++ lib/default.php | 5 +++++ lib/httpclient.php | 8 ++++++++ 3 files changed, 29 insertions(+) diff --git a/README b/README index e2e4c580ef..d972bf5676 100644 --- a/README +++ b/README @@ -1556,6 +1556,22 @@ cache: whether to cache the router in memcache (or another caching router cached) or others who see strange behavior. You're unlikely to need this unless you're a developer. +http +---- + +Settings for the HTTP client. + +ssl_cafile: location of the CA file for SSL. If not set, won't verify + SSL peers. Default unset. +curl: Use cURL for doing HTTP calls. You must + have the PHP curl extension installed for this to work. +proxy_host: Host to use for proxying HTTP requests. If unset, doesn't + do any HTTP proxy stuff. Default unset. +proxy_port: Port to use to connect to HTTP proxy host. Default null. +proxy_user: Username to use for authenticating to the HTTP proxy. Default null. +proxy_password: Password to use for authenticating to the HTTP proxy. Default null. +proxy_auth_scheme: Scheme to use for authenticating to the HTTP proxy. Default null. 
+ Plugins ======= diff --git a/lib/default.php b/lib/default.php index 6d57c4ef02..ce61de5ea5 100644 --- a/lib/default.php +++ b/lib/default.php @@ -331,6 +331,11 @@ $default = 'http' => // HTTP client settings when contacting other sites array('ssl_cafile' => false, // To enable SSL cert validation, point to a CA bundle (eg '/usr/lib/ssl/certs/ca-certificates.crt') 'curl' => false, // Use CURL backend for HTTP fetches if available. (If not, PHP's socket streams will be used.) + 'proxy_host' => null, + 'proxy_port' => null, + 'proxy_user' => null, + 'proxy_password' => null, + 'proxy_auth_scheme' => null, ), 'router' => array('cache' => true), // whether to cache the router object. Defaults to true, turn off for devel diff --git a/lib/httpclient.php b/lib/httpclient.php index 514a5afeb2..04e2b9ac65 100644 --- a/lib/httpclient.php +++ b/lib/httpclient.php @@ -149,6 +149,14 @@ class HTTPClient extends HTTP_Request2 $this->config['adapter'] = 'HTTP_Request2_Adapter_Curl'; } + foreach (array('host', 'port', 'user', 'password', 'auth_scheme') as $cf) { + $k = 'proxy_'.$cf; + $v = common_config('http', $k); + if (!empty($v)) { + $this->config[$k] = $v; + } + } + parent::__construct($url, $method, $config); $this->setHeader('User-Agent', $this->userAgent()); }