forked from GNUsocial/gnu-social
		
	
		
			
				
	
	
		
			324 lines
		
	
	
		
			9.9 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			324 lines
		
	
	
		
			9.9 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
<?php
/**
 * StatusNet - the distributed open-source microblogging tool
 * Copyright (C) 2010, StatusNet, Inc.
 *
 * Importer class for Delicious.com backups
 *
 * PHP version 5
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * @category  Bookmark
 * @package   StatusNet
 * @author    Evan Prodromou <evan@status.net>
 * @copyright 2010 StatusNet, Inc.
 * @license   http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
 * @link      http://status.net/
 */

// Refuse to run when the STATUSNET constant has not been defined: that means
// this file was requested directly from the web instead of being loaded by
// the application, and executing it that way could be a security problem.
if (defined('STATUSNET') === false) {
    exit(1);
}

/**
 * Importer class for Delicious bookmarks
 *
 * Queue handler that parses a Delicious.com backup file (Netscape
 * bookmarks.html format) and enqueues every public bookmark it finds on
 * the 'dlcsbkmk' queue.
 *
 * @category  Bookmark
 * @package   StatusNet
 * @author    Evan Prodromou <evan@status.net>
 * @copyright 2010 StatusNet, Inc.
 * @license   http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
 * @link      http://status.net/
 */
class DeliciousBackupImporter extends QueueHandler
{
    /**
     * Transport of the importer
     *
     * @return string transport string
     */
    public function transport()
    {
        return 'dlcsback';
    }

    /**
     * Import an in-memory bookmark list to a user's account
     *
     * Take a delicious.com backup file (same as Netscape bookmarks.html)
     * and enqueue each bookmark in it for import.
     *
     * The document format is terrible. It consists of a <dl> with
     * a bunch of <dt>'s, occasionally with <dd>'s adding descriptions.
     * There are sometimes <p>'s lost inside.
     *
     * A failure to import one bookmark is logged and never prevents the
     * remaining bookmarks from being processed.
     *
     * @param array $data pair of user, text
     *
     * @return boolean success value; always true, including when the input
     *                 cannot be parsed
     */
    public function handle($data)
    {
        list($user, $body) = $data;

        try {
            $doc = $this->importHTML($body);
        } catch (ClientException $cex) {
            // XXX: message to the user
            common_log(LOG_WARNING, $cex->getMessage());
            return true;
        }

        // If we can't parse it, it's no good

        if (empty($doc)) {
            return true;
        }

        $dls = $doc->getElementsByTagName('dl');

        if ($dls->length != 1) {
            // XXX: message to the user
            common_log(LOG_WARNING, 'Bad input file');
            return true;
        }

        // The most recent <dt> that has not been imported yet. It is held
        // back because the next element may be the <dd> that describes it.
        $pending = null;

        foreach ($dls->item(0)->childNodes as $child) {
            if ($child->nodeType != XML_ELEMENT_NODE) {
                continue;
            }

            switch (strtolower($child->tagName)) {
            case 'dt':
                // Make the new <dt> the pending one *before* importing the
                // previous one, so that a failure on the previous bookmark
                // (e.g. it is private) cannot make us lose this one.
                $previous = $pending;
                $pending  = $child;
                if ($previous !== null) {
                    // No <dd> was provided for the previous bookmark.
                    $this->importBookmarkSafely($user, $previous);
                }
                break;
            case 'dd':
                // This <dd> contains a description for the bookmark in the
                // preceding <dt> node; a <dd> with no pending <dt> is ignored.
                $previous = $pending;
                $pending  = null;
                if ($previous !== null) {
                    $this->importBookmarkSafely($user, $previous, $child);
                }
                break;
            case 'p':
                common_log(LOG_INFO, 'Skipping the <p> in the <dl>.');
                break;
            default:
                common_log(LOG_WARNING,
                           "Unexpected element $child->tagName ".
                           " found in import.");
            }
        }

        if ($pending !== null) {
            // There was a final bookmark without a description.
            $this->importBookmarkSafely($user, $pending);
        }

        return true;
    }

    /**
     * Import one bookmark, logging any failure instead of propagating it
     *
     * @param User            $user User to import data as
     * @param DOMElement      $dt   <dt> element
     * @param DOMElement|null $dd   <dd> element, if any
     *
     * @return void
     */
    private function importBookmarkSafely($user, $dt, $dd = null)
    {
        try {
            $this->importBookmark($user, $dt, $dd);
        } catch (Exception $e) {
            common_log(LOG_ERR, $e->getMessage());
        }
    }

    /**
     * Import a single bookmark
     *
     * Takes a <dt>/<dd> pair. The <dt> has a single
     * <a> in it with some non-standard attributes
     * (private, add_date, tags).
     *
     * The bookmark is not saved here; its data is placed on the
     * 'dlcsbkmk' queue.
     *
     * @param User            $user User to import data as
     * @param DOMElement      $dt   <dt> element
     * @param DOMElement|null $dd   <dd> element
     *
     * @return void
     *
     * @throws ClientException if the <dt> has no <a>, or the bookmark
     *                         is marked private
     */
    public function importBookmark($user, $dt, $dd = null)
    {
        $as = $dt->getElementsByTagName('a');

        if ($as->length == 0) {
            // TRANS: Client exception thrown when a bookmark in an import file is incorrectly formatted.
            throw new ClientException(_m("No <A> tag in a <DT>."));
        }

        $a = $as->item(0);

        // getAttribute() returns '' when the attribute is absent. Compare as
        // strings: a loose comparison against the integer 0 treats '' as
        // non-zero on PHP 8 and would reject every bookmark that simply
        // lacks the attribute.
        $private = trim($a->getAttribute('private'));

        if ($private !== '' && $private !== '0') {
            // TRANS: Client exception thrown when a bookmark in an import file is private.
            throw new ClientException(_m('Skipping private bookmark.'));
        }

        $description = ($dd !== null) ? $dd->nodeValue : null;

        // A missing or unparsable add_date would otherwise yield the Unix
        // epoch; fall back to the time of import instead.
        $addDate = intval($a->getAttribute('add_date'));
        if ($addDate <= 0) {
            $addDate = time();
        }

        $data = array(
            'profile_id' => $user->id,
            'title' => $a->nodeValue,
            'description' => $description,
            'url' => $a->getAttribute('href'),
            // -1 means "no limit"; passing null here is deprecated.
            'tags' => preg_split('/[\s,]+/', $a->getAttribute('tags'), -1, PREG_SPLIT_NO_EMPTY),
            'created' => common_sql_date($addDate)
        );

        $qm = QueueManager::get();
        $qm->enqueue($data, 'dlcsbkmk');
    }

    /**
     * Parse some HTML
     *
     * Hides the errors that the dom parser returns, then repairs
     * mis-nested <dt>/<dd> elements in every <body> of the document.
     *
     * @param string $body Data to import
     *
     * @return DOMDocument|null parsed document, or null if the input is
     *                          empty or could not be parsed
     */
    public function importHTML($body)
    {
        // DOMDocument::loadHTML() rejects an empty source outright (with an
        // exception on PHP 8), so treat that as "nothing to parse".
        if (!is_string($body) || $body === '') {
            return null;
        }

        // DOMDocument::loadHTML may complain about unrecognized elements
        // and namespaces; collect those errors internally and discard them
        // rather than changing the global error reporting level.
        $previous = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        $ok  = $dom->loadHTML($body);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if (!$ok) {
            return null;
        }

        foreach ($dom->getElementsByTagName('body') as $node) {
            $this->fixListsIn($node);
        }

        return $dom;
    }

    /**
     * Repair every <dl> that is a direct child of the given node
     *
     * @param DOMNode $body node whose child lists should be fixed
     *
     * @return void
     */
    public function fixListsIn(DOMNode $body)
    {
        // Collect first, fix afterwards, so the child list is not being
        // iterated while list contents are rearranged.
        $lists = array();

        foreach ($body->childNodes as $node) {
            if ($this->elementName($node) === 'dl') {
                $lists[] = $node;
            }
        }

        foreach ($lists as $list) {
            $this->fixList($list);
        }
    }

    /**
     * Repair the <dt>/<dd> items of one <dl>, descending into sublists
     *
     * @param DOMNode $list <dl> node to fix
     *
     * @return void
     */
    public function fixList(DOMNode $list)
    {
        $items = array();

        foreach ($list->childNodes as $node) {
            $name = $this->elementName($node);
            if ($name === 'dt' || $name === 'dd') {
                $items[] = $node;
            } elseif ($name === 'dl') {
                // Sublist.
                // Technically, these can only appear inside a <dd>...
                $this->fixList($node);
            }
        }

        foreach ($items as $item) {
            $this->fixListItem($item);
        }
    }

    /**
     * Move <dt>/<dd> elements nested inside a list item up to be its siblings
     *
     * The HTML parser in libxml2 doesn't seem to properly handle
     * many cases of implied close tags, apparently because it doesn't
     * understand the nesting rules specified in the HTML DTD.
     *
     * This leads to sequences of adjacent <dt>s or <dd>s being incorrectly
     * interpreted as parent->child trees instead of siblings:
     *
     * When parsing this input: "<dt>aaa <dt>bbb"
     * should be equivalent to: "<dt>aaa </dt><dt>bbb</dt>"
     * but we're seeing instead: "<dt>aaa <dt>bbb</dt></dt>"
     *
     * It does at least know that going from dt to dd, or dd to dt,
     * should make a break.
     *
     * @param DOMNode $item <dt> or <dd> node to fix
     *
     * @return void
     */
    public function fixListItem(DOMNode $item)
    {
        $misplaced = array();

        foreach ($item->childNodes as $node) {
            $name = $this->elementName($node);
            if ($name === 'dt' || $name === 'dd') {
                // dt & dd cannot contain each other;
                // This node was incorrectly placed; move it up a level!
                $misplaced[] = $node;
            } elseif ($name === 'dl') {
                // Sublist.
                // Technically, these can only appear inside a <dd>.
                $this->fixList($node);
            }
        }

        // Re-insert each misplaced node directly before the item's original
        // next sibling, which keeps the nodes in document order. Each moved
        // node is then fixed in turn, since it may itself contain further
        // mis-nested items.
        $parent = $item->parentNode;
        $anchor = $item->nextSibling;
        foreach ($misplaced as $node) {
            $item->removeChild($node);
            $parent->insertBefore($node, $anchor);
            $this->fixListItem($node);
        }
    }

    /**
     * Lower-cased element name of a node
     *
     * @param DOMNode $node node to inspect
     *
     * @return string|null lower-case node name, or null if the node is not
     *                     an element
     */
    private function elementName(DOMNode $node)
    {
        if ($node->nodeType != XML_ELEMENT_NODE) {
            return null;
        }
        return strtolower($node->nodeName);
    }
}