.
 *
 * @category  Bookmark
 * @package   StatusNet
 * @author    Evan Prodromou
 * @copyright 2010 StatusNet, Inc.
 * @license   http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
 * @link      http://status.net/
 */

if (!defined('STATUSNET')) {
    // This check helps protect against security problems;
    // your code file can't be executed directly from the web.
    exit(1);
}

/**
 * Importer class for Delicious bookmarks
 *
 * Queue handler for the 'dlcsback' transport. Each queue item is a
 * (user, HTML text) pair; every public bookmark found in the text is
 * re-enqueued on its own as an item of the 'dlcsbkmk' transport.
 *
 * @category  Bookmark
 * @package   StatusNet
 * @author    Evan Prodromou
 * @copyright 2010 StatusNet, Inc.
 * @license   http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
 * @link      http://status.net/
 */
class DeliciousBackupImporter extends QueueHandler
{
    /**
     * Transport of the importer
     *
     * @return string transport string
     */
    function transport()
    {
        return 'dlcsback';
    }

    /**
     * Import an in-memory bookmark list to a user's account
     *
     * Take a delicious.com backup file (same as Netscape bookmarks.html)
     * and import to StatusNet as Bookmark activities.
     *
     * The document format is terrible. It consists of a <dl> with
     * a bunch of <dt>'s, occasionally with <dd>'s adding descriptions.
     * There are sometimes <p>'s lost inside.
     *
     * A failure on one bookmark is logged and does not stop the import
     * of the remaining ones.
     *
     * @param array $data pair of user, text
     *
     * @return boolean success value
     *
     * @throws ClientException if the text cannot be parsed or does not
     *                         contain exactly one <dl>
     */
    function handle($data)
    {
        list($user, $body) = $data;

        $doc = $this->importHTML($body);

        if (!($doc instanceof DOMDocument)) {
            // importHTML() returns null when the parser rejects the input;
            // calling a method on that null would be a fatal error.
            throw new ClientException(_m("Bad import file."));
        }

        $dls = $doc->getElementsByTagName('dl');

        if ($dls->length != 1) {
            throw new ClientException(_m("Bad import file."));
        }

        $dl = $dls->item(0);

        // The most recent <dt> that has not been imported yet.
        $dt = null;

        foreach ($dl->childNodes as $child) {
            if ($child->nodeType != XML_ELEMENT_NODE) {
                continue;
            }

            switch (strtolower($child->tagName)) {
            case 'dt':
                // <dt> nodes contain primary information about a bookmark.
                // We can't import the current one just yet though, since
                // it may be followed by a <dd>.
                if ($dt !== null) {
                    // No <dd> was provided for the previous <dt>.
                    // A failure here must not prevent the current <dt>
                    // from becoming the pending one.
                    $this->tryImportBookmark($user, $dt);
                }
                $dt = $child;
                break;
            case 'dd':
                // This <dd> contains a description for the bookmark in
                // the preceding <dt> node.
                if ($dt === null) {
                    // Nothing to attach the description to.
                    common_log(LOG_WARNING,
                               'Skipping a <dd> with no preceding <dt>.');
                    break;
                }
                $this->tryImportBookmark($user, $dt, $child);
                $dt = null;
                break;
            case 'p':
                common_log(LOG_INFO, 'Skipping the <p> in the <dl>.');
                break;
            default:
                common_log(LOG_WARNING,
                           "Unexpected element $child->tagName ".
                           " found in import.");
            }
        }

        if ($dt !== null) {
            // There was a final bookmark without a description.
            $this->tryImportBookmark($user, $dt);
        }

        return true;
    }

    /**
     * Import one bookmark, logging instead of propagating any exception
     *
     * @param User       $user User to import data as
     * @param DOMElement $dt   <dt> element
     * @param DOMElement $dd   <dd> element, or null if there is none
     *
     * @return void
     */
    protected function tryImportBookmark($user, $dt, $dd = null)
    {
        try {
            $this->importBookmark($user, $dt, $dd);
        } catch (Exception $e) {
            common_log(LOG_ERR, $e->getMessage());
        }
    }

    /**
     * Import a single bookmark
     *
     * Takes a <dt>/<dd> pair. The first <a> found in the <dt> supplies
     * the title (its text) and the 'href', 'tags', 'add_date' and
     * 'private' attributes. The bookmark data is enqueued on the
     * 'dlcsbkmk' transport; nothing is saved directly here.
     *
     * @param User       $user User to import data as
     * @param DOMElement $dt   <dt> element
     * @param DOMElement $dd   <dd> element
     *
     * @return void
     *
     * @throws ClientException if the <dt> has no <a>, or if the bookmark
     *                         is marked private
     */
    function importBookmark($user, $dt, $dd = null)
    {
        $as = $dt->getElementsByTagName('a');

        if ($as->length == 0) {
            throw new ClientException(_m("No <A> tag in a <DT>."));
        }

        $a = $as->item(0);

        // getAttribute() returns '' when the attribute is absent. A loose
        // comparison of '' with 0 gives different answers on PHP 8 and on
        // earlier versions, so convert explicitly: anything that does not
        // parse to a non-zero integer counts as public.
        if (intval($a->getAttribute('private')) !== 0) {
            throw new ClientException(_m('Skipping private bookmark.'));
        }

        if (!empty($dd)) {
            $description = $dd->nodeValue;
        } else {
            $description = null;
        }

        $addDate = $a->getAttribute('add_date');

        $data = array(
            'profile_id'  => $user->id,
            'title'       => $a->nodeValue,
            'description' => $description,
            'url'         => $a->getAttribute('href'),
            'tags'        => $a->getAttribute('tags'),
            'created'     => common_sql_date(intval($addDate))
        );

        $qm = QueueManager::get();
        $qm->enqueue($data, 'dlcsbkmk');
    }

    /**
     * Parse some HTML
     *
     * Hides the errors that the dom parser returns, then repairs the
     * nesting of the definition lists found directly under <body>.
     *
     * @param string $body Data to import
     *
     * @return DOMDocument parsed document, or null if parsing failed
     */
    function importHTML($body)
    {
        // DOMDocument::loadHTML() rejects an empty source (a warning on
        // older PHP versions, a ValueError on PHP 8), so report the
        // failure ourselves before touching error_reporting().
        if (!is_string($body) || $body === '') {
            return null;
        }

        // DOMDocument::loadHTML may throw warnings on unrecognized elements,
        // and notices on unrecognized namespaces.
        $old = error_reporting(error_reporting() & ~(E_WARNING | E_NOTICE));
        $dom = new DOMDocument();
        $ok  = $dom->loadHTML($body);
        error_reporting($old);

        if (!$ok) {
            return null;
        }

        foreach ($dom->getElementsByTagName('body') as $node) {
            $this->fixListsIn($node);
        }

        return $dom;
    }

    /**
     * Repair every <dl> that is a direct child of the given node
     *
     * @param DOMNode $body node whose child lists should be fixed
     *
     * @return void
     */
    function fixListsIn(DOMNode $body)
    {
        // Collect first, fix afterwards: fixing moves nodes around, and
        // childNodes is a live list.
        $toFix = array();

        foreach ($body->childNodes as $node) {
            if ($node->nodeType == XML_ELEMENT_NODE) {
                $el = strtolower($node->nodeName);
                if ($el == 'dl') {
                    $toFix[] = $node;
                }
            }
        }

        foreach ($toFix as $node) {
            $this->fixList($node);
        }
    }

    /**
     * Repair the items of one <dl>, and any <dl> nested directly in it
     *
     * @param DOMNode $list <dl> node
     *
     * @return void
     */
    function fixList(DOMNode $list)
    {
        $toFix = array();

        foreach ($list->childNodes as $node) {
            if ($node->nodeType == XML_ELEMENT_NODE) {
                $el = strtolower($node->nodeName);
                if ($el == 'dt' || $el == 'dd') {
                    $toFix[] = $node;
                }
                if ($el == 'dl') {
                    // Sublist.
                    // Technically, these can only appear inside a <dd>...
                    $this->fixList($node);
                }
            }
        }

        foreach ($toFix as $node) {
            $this->fixListItem($node);
        }
    }

    /**
     * Hoist <dt>/<dd> elements wrongly nested inside a <dt> or <dd>
     *
     * Nested items are re-inserted, in order, as siblings immediately
     * after the given item, and are then fixed themselves.
     *
     * @param DOMNode $item <dt> or <dd> node
     *
     * @return void
     */
    function fixListItem(DOMNode $item)
    {
        // The HTML parser in libxml2 doesn't seem to properly handle
        // many cases of implied close tags, apparently because it doesn't
        // understand the nesting rules specified in the HTML DTD.
        //
        // This leads to sequences of adjacent <dt>s or <dd>s being
        // incorrectly interpreted as parent->child trees instead of
        // siblings:
        //
        // When parsing this input: "<dt>aaa <dt>bbb"
        // should be equivalent to: "<dt>aaa </dt><dt>bbb</dt>"
        // but we're seeing instead: "<dt>aaa <dt>bbb</dt></dt>"
        //
        // It does at least know that going from dt to dd, or dd to dt,
        // should make a break.

        $toMove = array();

        foreach ($item->childNodes as $node) {
            if ($node->nodeType == XML_ELEMENT_NODE) {
                $el = strtolower($node->nodeName);
                if ($el == 'dt' || $el == 'dd') {
                    // dt & dd cannot contain each other;
                    // This node was incorrectly placed; move it up a level!
                    $toMove[] = $node;
                }
                if ($el == 'dl') {
                    // Sublist.
                    // Technically, these can only appear inside a <dd>.
                    $this->fixList($node);
                }
            }
        }

        $parent = $item->parentNode;
        // Captured before any move, so every hoisted node lands between
        // $item and its original next sibling, keeping document order.
        $next = $item->nextSibling;

        foreach ($toMove as $node) {
            $item->removeChild($node);
            $parent->insertBefore($node, $next);
            $this->fixListItem($node);
        }
    }
}