gnu-social/plugins/Bookmark/lib/deliciousbackupimporter.php
Mikael Nordfeldth de55d8f83b plugins onAutoload now only overloads if necessary (extlibs etc.)
lib/plugin.php now has a parent onAutoload function that finds most common
files that are used in plugins (actions, dataobjects, forms, libs etc.) if
they are put in the standardised directories ('actions', 'classes', 'forms',
'lib' and perhaps some others in the future).
2013-08-28 16:10:30 +02:00

324 lines
9.9 KiB
PHP

<?php
/**
* StatusNet - the distributed open-source microblogging tool
* Copyright (C) 2010, StatusNet, Inc.
*
* Importer class for Delicious.com backups
*
* PHP version 5
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* @category Bookmark
* @package StatusNet
* @author Evan Prodromou <evan@status.net>
* @copyright 2010 StatusNet, Inc.
* @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
* @link http://status.net/
*/
if (!defined('STATUSNET')) {
// This check helps protect against security problems;
// your code file can't be executed directly from the web.
exit(1);
}
/**
* Importer class for Delicious bookmarks
*
* @category Bookmark
* @package StatusNet
* @author Evan Prodromou <evan@status.net>
* @copyright 2010 StatusNet, Inc.
* @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
* @link http://status.net/
*/
class DeliciousBackupImporter extends QueueHandler
{
/**
* Transport of the importer
*
* @return string transport string
*/
function transport()
{
return 'dlcsback';
}
/**
* Import an in-memory bookmark list to a user's account
*
* Take a delicious.com backup file (same as Netscape bookmarks.html)
* and import to StatusNet as Bookmark activities.
*
* The document format is terrible. It consists of a <dl> with
* a bunch of <dt>'s, occasionally with <dd>'s adding descriptions.
* There are sometimes <p>'s lost inside.
*
* @param array $data pair of user, text
*
* @return boolean success value
*/
function handle($data)
{
list($user, $body) = $data;
try {
$doc = $this->importHTML($body);
} catch (ClientException $cex) {
// XXX: message to the user
common_log(LOG_WARNING, $cex->getMessage());
return true;
}
// If we can't parse it, it's no good
if (empty($doc)) {
return true;
}
$dls = $doc->getElementsByTagName('dl');
if ($dls->length != 1) {
// XXX: message to the user
common_log(LOG_WARNING, 'Bad input file');
return true;
}
$dl = $dls->item(0);
$children = $dl->childNodes;
$dt = null;
for ($i = 0; $i < $children->length; $i++) {
try {
$child = $children->item($i);
if ($child->nodeType != XML_ELEMENT_NODE) {
continue;
}
switch (strtolower($child->tagName)) {
case 'dt':
// <dt> nodes contain primary information about a bookmark.
// We can't import the current one just yet though, since
// it may be followed by a <dd>.
if (!empty($dt)) {
// No DD provided
$this->importBookmark($user, $dt);
$dt = null;
}
$dt = $child;
break;
case 'dd':
$dd = $child;
if (!empty($dt)) {
// This <dd> contains a description for the bookmark in
// the preceding <dt> node.
$saved = $this->importBookmark($user, $dt, $dd);
}
$dt = null;
$dd = null;
break;
case 'p':
common_log(LOG_INFO, 'Skipping the <p> in the <dl>.');
break;
default:
common_log(LOG_WARNING,
"Unexpected element $child->tagName ".
" found in import.");
}
} catch (Exception $e) {
common_log(LOG_ERR, $e->getMessage());
$dt = $dd = null;
}
}
if (!empty($dt)) {
// There was a final bookmark without a description.
try {
$this->importBookmark($user, $dt);
} catch (Exception $e) {
common_log(LOG_ERR, $e->getMessage());
}
}
return true;
}
/**
* Import a single bookmark
*
* Takes a <dt>/<dd> pair. The <dt> has a single
* <a> in it with some non-standard attributes.
*
* A <dt><dt><dd> sequence will appear as a <dt> with
* anothe <dt> as a child. We handle this case recursively.
*
* @param User $user User to import data as
* @param DOMElement $dt <dt> element
* @param DOMElement $dd <dd> element
*
* @return Notice imported notice
*/
function importBookmark($user, $dt, $dd = null)
{
$as = $dt->getElementsByTagName('a');
if ($as->length == 0) {
// TRANS: Client exception thrown when a bookmark in an import file is incorrectly formatted.
throw new ClientException(_m("No <A> tag in a <DT>."));
}
$a = $as->item(0);
$private = $a->getAttribute('private');
if ($private != 0) {
// TRANS: Client exception thrown when a bookmark in an import file is private.
throw new ClientException(_m('Skipping private bookmark.'));
}
if (!empty($dd)) {
$description = $dd->nodeValue;
} else {
$description = null;
}
$addDate = $a->getAttribute('add_date');
$data = array(
'profile_id' => $user->id,
'title' => $a->nodeValue,
'description' => $description,
'url' => $a->getAttribute('href'),
'tags' => $a->getAttribute('tags'),
'created' => common_sql_date(intval($addDate))
);
$qm = QueueManager::get();
$qm->enqueue($data, 'dlcsbkmk');
}
/**
* Parse some HTML
*
* Hides the errors that the dom parser returns
*
* @param string $body Data to import
*
* @return DOMDocument parsed document
*/
function importHTML($body)
{
// DOMDocument::loadHTML may throw warnings on unrecognized elements,
// and notices on unrecognized namespaces.
$old = error_reporting(error_reporting() & ~(E_WARNING | E_NOTICE));
$dom = new DOMDocument();
$ok = $dom->loadHTML($body);
error_reporting($old);
if ($ok) {
foreach ($dom->getElementsByTagName('body') as $node) {
$this->fixListsIn($node);
}
return $dom;
} else {
return null;
}
}
function fixListsIn(DOMNode $body) {
$toFix = array();
foreach ($body->childNodes as $node) {
if ($node->nodeType == XML_ELEMENT_NODE) {
$el = strtolower($node->nodeName);
if ($el == 'dl') {
$toFix[] = $node;
}
}
}
foreach ($toFix as $node) {
$this->fixList($node);
}
}
function fixList(DOMNode $list) {
$toFix = array();
foreach ($list->childNodes as $node) {
if ($node->nodeType == XML_ELEMENT_NODE) {
$el = strtolower($node->nodeName);
if ($el == 'dt' || $el == 'dd') {
$toFix[] = $node;
}
if ($el == 'dl') {
// Sublist.
// Technically, these can only appear inside a <dd>...
$this->fixList($node);
}
}
}
foreach ($toFix as $node) {
$this->fixListItem($node);
}
}
function fixListItem(DOMNode $item) {
// The HTML parser in libxml2 doesn't seem to properly handle
// many cases of implied close tags, apparently because it doesn't
// understand the nesting rules specified in the HTML DTD.
//
// This leads to sequences of adjacent <dt>s or <dd>s being incorrectly
// interpreted as parent->child trees instead of siblings:
//
// When parsing this input: "<dt>aaa <dt>bbb"
// should be equivalent to: "<dt>aaa </dt><dt>bbb</dt>"
// but we're seeing instead: "<dt>aaa <dt>bbb</dt></dt>"
//
// It does at least know that going from dt to dd, or dd to dt,
// should make a break.
$toMove = array();
foreach ($item->childNodes as $node) {
if ($node->nodeType == XML_ELEMENT_NODE) {
$el = strtolower($node->nodeName);
if ($el == 'dt' || $el == 'dd') {
// dt & dd cannot contain each other;
// This node was incorrectly placed; move it up a level!
$toMove[] = $node;
}
if ($el == 'dl') {
// Sublist.
// Technically, these can only appear inside a <dd>.
$this->fixList($node);
}
}
}
$parent = $item->parentNode;
$next = $item->nextSibling;
foreach ($toMove as $node) {
$item->removeChild($node);
$parent->insertBefore($node, $next);
$this->fixListItem($node);
}
}
}