<?php
/**
 * StatusNet - the distributed open-source microblogging tool
 * Copyright (C) 2010, StatusNet, Inc.
 *
 * Importer class for Delicious.com backups
 *
 * PHP version 5
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * @category Bookmark
 * @package StatusNet
 * @author Evan Prodromou <evan@status.net>
 * @copyright 2010 StatusNet, Inc.
 * @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
 * @link http://status.net/
 */

// Security guard: the STATUSNET constant must already be defined when this
// file is loaded. If it is not, the file is being executed directly from the
// web, so stop immediately with exit status 1.
defined('STATUSNET') || exit(1);

/**
 * Importer class for Delicious bookmarks
 *
 * Queue handler for the 'dlcsback' transport. It parses a Delicious.com
 * backup file and enqueues one 'dlcsbkmk' queue item per public bookmark
 * found in it.
 *
 * @category Bookmark
 * @package StatusNet
 * @author Evan Prodromou <evan@status.net>
 * @copyright 2010 StatusNet, Inc.
 * @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
 * @link http://status.net/
 */
class DeliciousBackupImporter extends QueueHandler
{
    /**
     * Transport of the importer
     *
     * @return string transport string
     */
    public function transport()
    {
        return 'dlcsback';
    }

    /**
     * Import an in-memory bookmark list to a user's account
     *
     * Take a delicious.com backup file (same as Netscape bookmarks.html)
     * and import to StatusNet as Bookmark activities.
     *
     * The document format is terrible. It consists of a <dl> with
     * a bunch of <dt>'s, occasionally with <dd>'s adding descriptions.
     * There are sometimes <p>'s lost inside.
     *
     * Problems with the input are logged rather than reported as a
     * failure: every code path returns true.
     *
     * @param array $data pair of user, text
     *
     * @return boolean success value
     */
    public function handle($data) : bool
    {
        list($user, $body) = $data;

        try {
            $doc = $this->importHTML($body);
        } catch (ClientException $cex) {
            // XXX: message to the user
            common_log(LOG_WARNING, $cex->getMessage());
            return true;
        }

        // If we can't parse it, it's no good
        if (empty($doc)) {
            return true;
        }

        $dls = $doc->getElementsByTagName('dl');

        if ($dls->length !== 1) {
            // XXX: message to the user
            common_log(LOG_WARNING, 'Bad input file');
            return true;
        }

        // The most recent <dt> that has not been imported yet, if any.
        $dt = null;

        foreach ($dls->item(0)->childNodes as $child) {
            if ($child->nodeType !== XML_ELEMENT_NODE) {
                continue;
            }

            try {
                switch (strtolower($child->tagName)) {
                case 'dt':
                    // <dt> nodes contain primary information about a bookmark.
                    // We can't import the current one just yet though, since
                    // it may be followed by a <dd>.
                    //
                    // Remember the new <dt> *before* importing the previous
                    // one: importBookmark() may throw (e.g. for a private
                    // bookmark), and that must not make us lose this <dt>.
                    $previous = $dt;
                    $dt = $child;
                    if ($previous !== null) {
                        // No DD provided
                        $this->importBookmark($user, $previous);
                    }
                    break;
                case 'dd':
                    // This <dd> contains a description for the bookmark in
                    // the preceding <dt> node, if there is one. Either way
                    // the pending <dt> is consumed.
                    $previous = $dt;
                    $dt = null;
                    if ($previous !== null) {
                        $this->importBookmark($user, $previous, $child);
                    }
                    break;
                case 'p':
                    common_log(LOG_INFO, 'Skipping the <p> in the <dl>.');
                    break;
                default:
                    common_log(LOG_WARNING,
                               "Unexpected element $child->tagName ".
                               " found in import.");
                }
            } catch (Exception $e) {
                // One bad bookmark must not abort the whole import. $dt was
                // already updated above, so there is nothing to reset here.
                common_log(LOG_ERR, $e->getMessage());
            }
        }

        if ($dt !== null) {
            // There was a final bookmark without a description.
            try {
                $this->importBookmark($user, $dt);
            } catch (Exception $e) {
                common_log(LOG_ERR, $e->getMessage());
            }
        }

        return true;
    }

    /**
     * Import a single bookmark
     *
     * Takes a <dt>/<dd> pair. The <dt> has a single
     * <a> in it with some non-standard attributes
     * (private, add_date, tags).
     *
     * The bookmark is not saved here; its data is enqueued on the
     * 'dlcsbkmk' transport.
     *
     * A <dt><dt><dd> sequence may be parsed as a <dt> with another <dt>
     * as a child; importHTML() flattens such nesting for lists directly
     * under <body> before this method sees them. Only the first <a>
     * found under the <dt> is used.
     *
     * @param User       $user User to import data as
     * @param DOMElement $dt   <dt> element
     * @param DOMElement $dd   <dd> element
     *
     * @return void
     *
     * @throws ClientException if the <dt> has no <a>, or the bookmark is private
     */
    public function importBookmark($user, $dt, $dd = null)
    {
        $as = $dt->getElementsByTagName('a');

        if ($as->length === 0) {
            // TRANS: Client exception thrown when a bookmark in an import file is incorrectly formatted.
            throw new ClientException(_m("No <A> tag in a <DT>."));
        }

        $a = $as->item(0);

        // getAttribute() returns '' for a missing attribute. Compare as an
        // integer: a loose `'' != 0` is true under PHP 8 comparison rules,
        // which would treat every bookmark without the attribute as private.
        if (intval($a->getAttribute('private')) !== 0) {
            // TRANS: Client exception thrown when a bookmark in an import file is private.
            throw new ClientException(_m('Skipping private bookmark.'));
        }

        $description = empty($dd) ? null : $dd->nodeValue;
        $addDate     = $a->getAttribute('add_date');

        // Tags are separated by whitespace and/or commas. A limit of -1
        // means "no limit"; passing null there is deprecated since PHP 8.1.
        $tags = preg_split(
            '/[\s,]+/',
            $a->getAttribute('tags'),
            -1,
            PREG_SPLIT_NO_EMPTY
        );

        $data = [
            'profile_id'  => $user->id,
            'title'       => $a->nodeValue,
            'description' => $description,
            'url'         => $a->getAttribute('href'),
            'tags'        => $tags,
            'created'     => common_sql_date(intval($addDate)),
        ];

        $qm = QueueManager::get();
        $qm->enqueue($data, 'dlcsbkmk');
    }

    /**
     * Parse some HTML
     *
     * Hides the errors that the dom parser returns, then repairs the
     * nesting of definition lists found directly under <body>.
     *
     * @param string $body Data to import
     *
     * @return DOMDocument|null parsed document, or null if the input is
     *                          empty or could not be parsed
     */
    public function importHTML($body)
    {
        // DOMDocument::loadHTML() throws a ValueError for an empty string
        // in PHP 8; that is not an Exception, so callers would not catch
        // it. Treat empty input as "could not parse".
        if ($body === null || $body === '') {
            return null;
        }

        // DOMDocument::loadHTML may complain about unrecognized elements
        // and namespaces. Have libxml buffer those errors internally so
        // they are never raised as PHP warnings/notices, and restore the
        // previous setting even if parsing throws.
        $previous = libxml_use_internal_errors(true);
        $dom = new DOMDocument();

        try {
            $ok = $dom->loadHTML($body);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        if (!$ok) {
            return null;
        }

        foreach ($dom->getElementsByTagName('body') as $node) {
            $this->fixListsIn($node);
        }

        return $dom;
    }

    /**
     * Repair every <dl> that is a direct child of the given node
     *
     * The lists are collected first and fixed afterwards, so the child
     * list is not modified while it is being iterated.
     *
     * @param DOMNode $body node whose direct <dl> children get fixed
     *
     * @return void
     */
    public function fixListsIn(DOMNode $body)
    {
        $lists = [];

        foreach ($body->childNodes as $node) {
            if ($node->nodeType === XML_ELEMENT_NODE
                && strtolower($node->nodeName) === 'dl') {
                $lists[] = $node;
            }
        }

        foreach ($lists as $list) {
            $this->fixList($list);
        }
    }

    /**
     * Repair the <dt>/<dd> items of one definition list
     *
     * Direct <dl> children are fixed recursively as they are encountered;
     * direct <dt>/<dd> children are collected and passed to fixListItem()
     * once the iteration has finished, because fixListItem() inserts new
     * siblings into this list.
     *
     * @param DOMNode $list <dl> element to fix
     *
     * @return void
     */
    public function fixList(DOMNode $list)
    {
        $items = [];

        foreach ($list->childNodes as $node) {
            if ($node->nodeType !== XML_ELEMENT_NODE) {
                continue;
            }

            $el = strtolower($node->nodeName);

            if ($el === 'dt' || $el === 'dd') {
                $items[] = $node;
            } elseif ($el === 'dl') {
                // Sublist.
                // Technically, these can only appear inside a <dd>...
                $this->fixList($node);
            }
        }

        foreach ($items as $item) {
            $this->fixListItem($item);
        }
    }

    /**
     * Move <dt>/<dd> elements nested inside a list item up to be siblings
     *
     * The HTML parser in libxml2 doesn't seem to properly handle
     * many cases of implied close tags, apparently because it doesn't
     * understand the nesting rules specified in the HTML DTD.
     *
     * This leads to sequences of adjacent <dt>s or <dd>s being incorrectly
     * interpreted as parent->child trees instead of siblings:
     *
     * When parsing this input: "<dt>aaa <dt>bbb"
     * should be equivalent to: "<dt>aaa </dt><dt>bbb</dt>"
     * but we're seeing instead: "<dt>aaa <dt>bbb</dt></dt>"
     *
     * It does at least know that going from dt to dd, or dd to dt,
     * should make a break.
     *
     * @param DOMNode $item <dt> or <dd> element to fix
     *
     * @return void
     */
    public function fixListItem(DOMNode $item)
    {
        $misplaced = [];

        foreach ($item->childNodes as $node) {
            if ($node->nodeType !== XML_ELEMENT_NODE) {
                continue;
            }

            $el = strtolower($node->nodeName);

            if ($el === 'dt' || $el === 'dd') {
                // dt & dd cannot contain each other;
                // This node was incorrectly placed; move it up a level!
                $misplaced[] = $node;
            } elseif ($el === 'dl') {
                // Sublist.
                // Technically, these can only appear inside a <dd>.
                $this->fixList($node);
            }
        }

        // Re-insert each misplaced node right after $item, i.e. before
        // $item's original next sibling, which keeps document order. Each
        // moved node is then fixed too, since it may contain further
        // nested items.
        $parent = $item->parentNode;
        $next   = $item->nextSibling;

        foreach ($misplaced as $node) {
            $item->removeChild($node);
            $parent->insertBefore($node, $next);
            $this->fixListItem($node);
        }
    }
}