<?php
/**
 * StatusNet - the distributed open-source microblogging tool
 * Copyright (C) 2010, StatusNet, Inc.
 *
 * Importer class for Delicious.com backups
 *
 * PHP version 5
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * @category  Bookmark
 * @package   StatusNet
 * @author    Evan Prodromou <evan@status.net>
 * @copyright 2010 StatusNet, Inc.
 * @license   http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
 * @link      http://status.net/
 */

if (!defined('STATUSNET')) {
    // This check helps protect against security problems;
    // your code file can't be executed directly from the web.
    exit(1);
}

/**
 * Importer class for Delicious bookmarks
 *
 * @category  Bookmark
 * @package   StatusNet
 * @author    Evan Prodromou <evan@status.net>
 * @copyright 2010 StatusNet, Inc.
 * @license   http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
 * @link      http://status.net/
 */
class DeliciousBackupImporter extends QueueHandler
{
    /**
     * Transport of the importer
     *
     * @return string transport string
     */
    function transport()
    {
        return 'dlcsback';
    }

    /**
     * Import an in-memory bookmark list to a user's account
     *
     * Take a delicious.com backup file (same as Netscape bookmarks.html)
     * and import to StatusNet as Bookmark activities.
     *
     * The document format is terrible. It consists of a <dl> with
     * a bunch of <dt>'s, occasionally with <dd>'s adding descriptions.
     * There are sometimes <p>'s lost inside.
     *
     * Failures (unparseable input, a document that does not contain
     * exactly one <dl>, individual bookmarks that cannot be imported)
     * are logged and otherwise ignored; every code path returns true.
     *
     * @param array $data pair of user, text
     *
     * @return boolean success value (always true)
     */
    function handle($data)
    {
        list($user, $body) = $data;

        try {
            $doc = $this->importHTML($body);
        } catch (ClientException $cex) {
            // XXX: message to the user
            common_log(LOG_WARNING, $cex->getMessage());
            return true;
        }

        // If we can't parse it, it's no good
        if (empty($doc)) {
            return true;
        }

        $dls = $doc->getElementsByTagName('dl');

        if ($dls->length != 1) {
            // XXX: message to the user
            common_log(LOG_WARNING, 'Bad input file');
            return true;
        }

        $children = $dls->item(0)->childNodes;

        // The most recent <dt> that has not been imported yet.
        $dt = null;

        for ($i = 0; $i < $children->length; $i++) {
            $child = $children->item($i);

            if ($child->nodeType != XML_ELEMENT_NODE) {
                continue;
            }

            switch (strtolower($child->tagName)) {
            case 'dt':
                // <dt> nodes contain primary information about a bookmark.
                // We can't import the current one just yet though, since
                // it may be followed by a <dd>.
                //
                // Remember the new <dt> *before* flushing the previous
                // one, so that a failure while importing the previous
                // bookmark cannot make us lose the current one.
                $previous = $dt;
                $dt       = $child;
                if ($previous !== null) {
                    // No DD provided
                    $this->tryImportBookmark($user, $previous);
                }
                break;
            case 'dd':
                if ($dt !== null) {
                    // This <dd> contains a description for the bookmark in
                    // the preceding <dt> node.
                    $this->tryImportBookmark($user, $dt, $child);
                }
                // Either way the pending <dt>, if any, is now consumed.
                $dt = null;
                break;
            case 'p':
                common_log(LOG_INFO, 'Skipping the <p> in the <dl>.');
                break;
            default:
                common_log(LOG_WARNING,
                           "Unexpected element $child->tagName ".
                           " found in import.");
            }
        }

        if ($dt !== null) {
            // There was a final bookmark without a description.
            $this->tryImportBookmark($user, $dt);
        }

        return true;
    }

    /**
     * Import a single bookmark, logging any failure instead of propagating it
     *
     * Used by handle() so that one bad (or private) bookmark does not
     * abort the import of the rest of the file.
     *
     * @param User       $user User to import data as
     * @param DOMElement $dt   <dt> element
     * @param DOMElement $dd   <dd> element, or null if there is none
     *
     * @return void
     */
    protected function tryImportBookmark($user, $dt, $dd = null)
    {
        try {
            $this->importBookmark($user, $dt, $dd);
        } catch (Exception $e) {
            common_log(LOG_ERR, $e->getMessage());
        }
    }

    /**
     * Import a single bookmark
     *
     * Takes a <dt>/<dd> pair. The <dt> has a single
     * <a> in it with some non-standard attributes.
     *
     * The bookmark is not saved here directly; its data is enqueued
     * on the 'dlcsbkmk' queue.
     *
     * @param User       $user User to import data as
     * @param DOMElement $dt   <dt> element
     * @param DOMElement $dd   <dd> element
     *
     * @return void
     *
     * @throws ClientException if the <dt> has no <a>, or the bookmark
     *                         is marked private
     */
    function importBookmark($user, $dt, $dd = null)
    {
        $as = $dt->getElementsByTagName('a');

        if ($as->length == 0) {
            // TRANS: Client exception thrown when a bookmark in an import file is incorrectly formatted.
            throw new ClientException(_m("No <A> tag in a <DT>."));
        }

        $a = $as->item(0);

        // getAttribute() returns '' when the attribute is absent. Comparing
        // that string loosely against the integer 0 gives different answers
        // on PHP 7 and PHP 8, so decide explicitly: a missing/empty value or
        // a numeric zero means public; anything else is treated as private
        // (unrecognised values err on the side of not publishing).
        $private = trim($a->getAttribute('private'));

        if (is_numeric($private)) {
            $isPrivate = ((float)$private != 0);
        } else {
            $isPrivate = ($private !== '');
        }

        if ($isPrivate) {
            // TRANS: Client exception thrown when a bookmark in an import file is private.
            throw new ClientException(_m('Skipping private bookmark.'));
        }

        if (!empty($dd)) {
            $description = $dd->nodeValue;
        } else {
            $description = null;
        }

        $addDate = $a->getAttribute('add_date');

        $data = array(
            'profile_id'  => $user->id,
            'title'       => $a->nodeValue,
            'description' => $description,
            'url'         => $a->getAttribute('href'),
            'tags'        => $a->getAttribute('tags'),
            'created'     => common_sql_date(intval($addDate))
        );

        $qm = QueueManager::get();
        $qm->enqueue($data, 'dlcsbkmk');
    }

    /**
     * Parse some HTML
     *
     * Hides the errors that the dom parser returns
     *
     * @param string $body Data to import
     *
     * @return DOMDocument parsed document, or null if the body is empty
     *                     or could not be parsed
     */
    function importHTML($body)
    {
        // Nothing to parse. Bail out before touching error_reporting():
        // on PHP 8 DOMDocument::loadHTML() throws a ValueError for an empty
        // source, which would also leave the reporting level lowered.
        if ($body === null || $body === '') {
            return null;
        }

        // DOMDocument::loadHTML may throw warnings on unrecognized elements,
        // and notices on unrecognized namespaces.
        $old = error_reporting(error_reporting() & ~(E_WARNING | E_NOTICE));
        $dom = new DOMDocument();
        $ok  = $dom->loadHTML($body);
        error_reporting($old);

        if (!$ok) {
            return null;
        }

        foreach ($dom->getElementsByTagName('body') as $node) {
            $this->fixListsIn($node);
        }

        return $dom;
    }

    /**
     * Repair every <dl> that is a direct child of the given node
     *
     * @param DOMNode $body node whose immediate <dl> children get fixed
     *
     * @return void
     */
    function fixListsIn(DOMNode $body)
    {
        // Collect first, then fix, so the child list is not being
        // modified while it is iterated.
        $toFix = array();

        foreach ($body->childNodes as $node) {
            if ($node->nodeType == XML_ELEMENT_NODE) {
                $el = strtolower($node->nodeName);
                if ($el == 'dl') {
                    $toFix[] = $node;
                }
            }
        }

        foreach ($toFix as $node) {
            $this->fixList($node);
        }
    }

    /**
     * Repair the <dt>/<dd> items of one <dl>, recursing into sublists
     *
     * @param DOMNode $list the <dl> element to fix
     *
     * @return void
     */
    function fixList(DOMNode $list)
    {
        $toFix = array();

        foreach ($list->childNodes as $node) {
            if ($node->nodeType == XML_ELEMENT_NODE) {
                $el = strtolower($node->nodeName);
                if ($el == 'dt' || $el == 'dd') {
                    $toFix[] = $node;
                }
                if ($el == 'dl') {
                    // Sublist.
                    // Technically, these can only appear inside a <dd>...
                    $this->fixList($node);
                }
            }
        }

        foreach ($toFix as $node) {
            $this->fixListItem($node);
        }
    }

    /**
     * Hoist wrongly nested <dt>/<dd> elements out of a list item
     *
     * Any <dt> or <dd> found as a direct child of the item is moved to
     * become a following sibling of the item (original order is kept),
     * and is then fixed in the same way. Any <dl> directly inside the
     * item is fixed as a sublist.
     *
     * @param DOMNode $item the <dt> or <dd> element to fix
     *
     * @return void
     */
    function fixListItem(DOMNode $item)
    {
        // The HTML parser in libxml2 doesn't seem to properly handle
        // many cases of implied close tags, apparently because it doesn't
        // understand the nesting rules specified in the HTML DTD.
        //
        // This leads to sequences of adjacent <dt>s or <dd>s being incorrectly
        // interpreted as parent->child trees instead of siblings:
        //
        // When parsing this input: "<dt>aaa <dt>bbb"
        // should be equivalent to: "<dt>aaa </dt><dt>bbb</dt>"
        // but we're seeing instead: "<dt>aaa <dt>bbb</dt></dt>"
        //
        // It does at least know that going from dt to dd, or dd to dt,
        // should make a break.

        $toMove = array();

        foreach ($item->childNodes as $node) {
            if ($node->nodeType == XML_ELEMENT_NODE) {
                $el = strtolower($node->nodeName);
                if ($el == 'dt' || $el == 'dd') {
                    // dt & dd cannot contain each other;
                    // This node was incorrectly placed; move it up a level!
                    $toMove[] = $node;
                }
                if ($el == 'dl') {
                    // Sublist.
                    // Technically, these can only appear inside a <dd>.
                    $this->fixList($node);
                }
            }
        }

        // Insert each moved node before the item's original next sibling,
        // which keeps the moved nodes in document order right after $item.
        $parent = $item->parentNode;
        $next   = $item->nextSibling;

        foreach ($toMove as $node) {
            $item->removeChild($node);
            $parent->insertBefore($node, $next);
            $this->fixListItem($node);
        }
    }
}