Separate Netscape bookmark parser and add a license header.
This commit is contained in:
parent
0600f02a13
commit
e567e30ce0
149
src/SemanticScuttle/parse_netscape_bookmarks.php
Normal file
149
src/SemanticScuttle/parse_netscape_bookmarks.php
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
<?php
|
||||||
|
/***************************************************************************
|
||||||
|
The MIT License (MIT)
|
||||||
|
|
||||||
|
Copyright (c) 2015 kafene
|
||||||
|
https://github.com/kafene/netscape-bookmark-parser
|
||||||
|
http://kafene.org
|
||||||
|
|
||||||
|
Slightly modified by yohan.
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Basically netscape bookmark files often come so badly formed, there's
|
||||||
|
* no reliable way I could find to parse them with DOM or SimpleXML,
|
||||||
|
* even after running HTML Tidy on them. So, this function does a bunch of
|
||||||
|
* transformations on the general format of a netscape bookmark file, to get
|
||||||
|
* Each bookmark and its description onto one line, and goes through line by
|
||||||
|
* line, matching tags and attributes. It's messy, but it works better than
|
||||||
|
* anything I could find in hours of googling, and anything that I could
|
||||||
|
* write after hours with DOM and SimpleXML. I didn't want to pull in a big
|
||||||
|
* DOM parsing library just to do this one thing, so this is it.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
print '<PRE>';
|
||||||
|
var_dump(parse_netscape_bookmarks(file_get_contents('bookmarks_export.htm')));
|
||||||
|
*/
|
||||||
|
|
||||||
|
function parse_netscape_bookmarks($bkmk_str, $default_tag = null) {
|
||||||
|
$i = 0;
|
||||||
|
$next = false;
|
||||||
|
$items = [];
|
||||||
|
|
||||||
|
$current_tag = $default_tag = $default_tag ?: 'autoimported-'.date("Ymd");
|
||||||
|
|
||||||
|
$bkmk_str = str_replace(["\r","\n","\t"], ['','',' '], $bkmk_str);
|
||||||
|
|
||||||
|
$bkmk_str = preg_replace_callback('@<dd>(.*?)(<A|<\/|<DL|<DT|<P)@mis', function($m) {
|
||||||
|
return '<dd>'.str_replace(["\r", "\n"], ['', '<br>'], trim($m[1])).'</';
|
||||||
|
}, $bkmk_str);
|
||||||
|
|
||||||
|
$bkmk_str = preg_replace('/>(\s*?)</mis', ">\n<", $bkmk_str);
|
||||||
|
$bkmk_str = preg_replace('/(<!DOCTYPE|<META|<!--|<TITLE|<H1|<P)(.*?)\n/i', '', $bkmk_str);
|
||||||
|
|
||||||
|
$bkmk_str = trim($bkmk_str);
|
||||||
|
$bkmk_str = preg_replace('/\n<dd/i', '<dd', $bkmk_str);
|
||||||
|
//best way to do it :
|
||||||
|
$bkmk_str = preg_replace('/(?<=.)<\/DL>/', "\n</DL>", $bkmk_str);
|
||||||
|
$lines = explode("\n", $bkmk_str);
|
||||||
|
|
||||||
|
$str_bool = function($str, $default = false) {
|
||||||
|
if (!$str) {
|
||||||
|
return false;
|
||||||
|
} elseif (!is_string($str) && $str) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
$true = 'y|yes|on|checked|ok|1|true|array|\+|okay|yes+|t|one';
|
||||||
|
$false = 'n|no|off|empty|null|false|0|-|exit|die|neg|f|zero|void';
|
||||||
|
|
||||||
|
if (preg_match("/^($true)$/i", $str)) {
|
||||||
|
return true;
|
||||||
|
} elseif (preg_match("/^($false)$/i", $str)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return $default;
|
||||||
|
};
|
||||||
|
$tags = array($default_tag);
|
||||||
|
foreach ($lines as $line_no => $line) {
|
||||||
|
/* If we match a tag, set current tag to that, if <DL>, stop tag. */
|
||||||
|
if (preg_match('/^<h\d(.*?)>(.*?)<\/h\d>/i', $line, $m1)) {
|
||||||
|
$current_tag = trim(preg_replace("/\s+/", "_", strtr($m1[2], ', /+', '____')));
|
||||||
|
$tags[] = $current_tag;
|
||||||
|
continue;
|
||||||
|
} elseif (preg_match('/^<\/DL>/i', $line)) {
|
||||||
|
$current_tag = $default_tag;
|
||||||
|
array_pop($tags);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (preg_match('/<a/i', $line, $m2)) {
|
||||||
|
$items[$i]['tags'] = $tags;
|
||||||
|
|
||||||
|
if (preg_match('/href="(.*?)"/i', $line, $m3)) {
|
||||||
|
$items[$i]['uri'] = $m3[1];
|
||||||
|
// $items[$i]['meta'] = meta($m3[1]);
|
||||||
|
} else {
|
||||||
|
$items[$i]['uri'] = '';
|
||||||
|
// $items[$i]['meta'] = '';
|
||||||
|
}
|
||||||
|
|
||||||
|
if (preg_match('/<a(.*?)>(.*?)<\/a>/i', $line, $m4)) {
|
||||||
|
$items[$i]['title'] = $m4[2];
|
||||||
|
// $items[$i]['slug'] = slugify($m4[2]);
|
||||||
|
} else {
|
||||||
|
$items[$i]['title'] = 'untitled';
|
||||||
|
// $items[$i]['slug'] = '';
|
||||||
|
}
|
||||||
|
|
||||||
|
if (preg_match('/note="(.*?)"<\/a>/i', $line, $m5)) {
|
||||||
|
$items[$i]['note'] = $m5[1];
|
||||||
|
} elseif (preg_match('/<dd>(.*?)<\//i', $line, $m6)) {
|
||||||
|
$items[$i]['note'] = str_replace('<br>', "\n", $m6[1]);
|
||||||
|
} else {
|
||||||
|
$items[$i]['note'] = '';
|
||||||
|
}
|
||||||
|
|
||||||
|
if (preg_match('/(tags?|labels?|folders?)="(.*?)"/i', $line, $m7)) {
|
||||||
|
array_unique(array_merge($items[$i]['tags'], explode(' ', trim(preg_replace("/\s+/", " ", strtr($m7[2], ',', ' '))))));
|
||||||
|
}
|
||||||
|
if (preg_match('/add_date="(.*?)"/i', $line, $m8)) {
|
||||||
|
$items[$i]['time'] = $m8[1];
|
||||||
|
} else {
|
||||||
|
$items[$i]['time'] = time();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (preg_match('/(public|published|pub)="(.*?)"/i', $line, $m9)) {
|
||||||
|
$items[$i]['pub'] = $str_bool($m9[2], false) ? 1 : 0;
|
||||||
|
} elseif (preg_match('/(private|shared)="(.*?)"/i', $line, $m10)) {
|
||||||
|
$items[$i]['pub'] = $str_bool($m10[2], true) ? 0 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
$i++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ksort($items);
|
||||||
|
|
||||||
|
return $items;
|
||||||
|
}
|
||||||
|
|
||||||
|
?>
|
@ -20,6 +20,13 @@
|
|||||||
***************************************************************************/
|
***************************************************************************/
|
||||||
|
|
||||||
require_once 'www-header.php';
|
require_once 'www-header.php';
|
||||||
|
if ('@data_dir@' == '@' . 'data_dir@') {
|
||||||
|
//non pear-install
|
||||||
|
require_once dirname(__FILE__) . '/../src/SemanticScuttle/parse_netscape_bookmarks.php';
|
||||||
|
} else {
|
||||||
|
//pear installation; files are in include path
|
||||||
|
require_once 'SemanticScuttle/parse_netscape_bookmarks.php';
|
||||||
|
}
|
||||||
|
|
||||||
/* Service creation: only useful services are created */
|
/* Service creation: only useful services are created */
|
||||||
$bookmarkservice =SemanticScuttle_Service_Factory::get('Bookmark');
|
$bookmarkservice =SemanticScuttle_Service_Factory::get('Bookmark');
|
||||||
@ -101,130 +108,4 @@ if ($userservice->isLoggedOn() && sizeof($_FILES) > 0 && $_FILES['userfile']['si
|
|||||||
$templateservice->loadTemplate($templatename, $tplVars);
|
$templateservice->loadTemplate($templatename, $tplVars);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Basically netscape bookmark files often come so badly formed, there's
|
|
||||||
* no reliable way I could find to parse them with DOM or SimpleXML,
|
|
||||||
* even after running HTML Tidy on them. So, this function does a bunch of
|
|
||||||
* transformations on the general format of a netscape bookmark file, to get
|
|
||||||
* Each bookmark and its description onto one line, and goes through line by
|
|
||||||
* line, matching tags and attributes. It's messy, but it works better than
|
|
||||||
* anything I could find in hours of googling, and anything that I could
|
|
||||||
* write after hours with DOM and SimpleXML. I didn't want to pull in a big
|
|
||||||
* DOM parsing library just to do this one thing, so this is it.
|
|
||||||
* @todo - running Tidy before doing this might be beneficial.
|
|
||||||
* ?? $bkmk_str = tidy_parse_string($bkmk_str)->cleanRepair();
|
|
||||||
*
|
|
||||||
* Update 2013-07-08:
|
|
||||||
* Just tested this on an export of some bookmarks from Pinboard.in
|
|
||||||
* and it seems that it is still working, so good for me.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
print '<PRE>';
|
|
||||||
var_dump(parse_netscape_bookmarks(file_get_contents('bookmarks_export.htm')));
|
|
||||||
*/
|
|
||||||
|
|
||||||
function parse_netscape_bookmarks($bkmk_str, $default_tag = null) {
|
|
||||||
$i = 0;
|
|
||||||
$next = false;
|
|
||||||
$items = [];
|
|
||||||
|
|
||||||
$current_tag = $default_tag = $default_tag ?: 'autoimported-'.date("Ymd");
|
|
||||||
|
|
||||||
$bkmk_str = str_replace(["\r","\n","\t"], ['','',' '], $bkmk_str);
|
|
||||||
|
|
||||||
$bkmk_str = preg_replace_callback('@<dd>(.*?)(<A|<\/|<DL|<DT|<P)@mis', function($m) {
|
|
||||||
return '<dd>'.str_replace(["\r", "\n"], ['', '<br>'], trim($m[1])).'</';
|
|
||||||
}, $bkmk_str);
|
|
||||||
|
|
||||||
$bkmk_str = preg_replace('/>(\s*?)</mis', ">\n<", $bkmk_str);
|
|
||||||
$bkmk_str = preg_replace('/(<!DOCTYPE|<META|<!--|<TITLE|<H1|<P)(.*?)\n/i', '', $bkmk_str);
|
|
||||||
|
|
||||||
$bkmk_str = trim($bkmk_str);
|
|
||||||
$bkmk_str = preg_replace('/\n<dd/i', '<dd', $bkmk_str);
|
|
||||||
//best way to do it :
|
|
||||||
$bkmk_str = preg_replace('/(?<=.)<\/DL>/', "\n</DL>", $bkmk_str);
|
|
||||||
$lines = explode("\n", $bkmk_str);
|
|
||||||
|
|
||||||
$str_bool = function($str, $default = false) {
|
|
||||||
if (!$str) {
|
|
||||||
return false;
|
|
||||||
} elseif (!is_string($str) && $str) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
$true = 'y|yes|on|checked|ok|1|true|array|\+|okay|yes+|t|one';
|
|
||||||
$false = 'n|no|off|empty|null|false|0|-|exit|die|neg|f|zero|void';
|
|
||||||
|
|
||||||
if (preg_match("/^($true)$/i", $str)) {
|
|
||||||
return true;
|
|
||||||
} elseif (preg_match("/^($false)$/i", $str)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return $default;
|
|
||||||
};
|
|
||||||
$tags = array($default_tag);
|
|
||||||
foreach ($lines as $line_no => $line) {
|
|
||||||
/* If we match a tag, set current tag to that, if <DL>, stop tag. */
|
|
||||||
if (preg_match('/^<h\d(.*?)>(.*?)<\/h\d>/i', $line, $m1)) {
|
|
||||||
$current_tag = trim(preg_replace("/\s+/", "_", strtr($m1[2], ', /+', '____')));
|
|
||||||
$tags[] = $current_tag;
|
|
||||||
continue;
|
|
||||||
} elseif (preg_match('/^<\/DL>/i', $line)) {
|
|
||||||
$current_tag = $default_tag;
|
|
||||||
array_pop($tags);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (preg_match('/<a/i', $line, $m2)) {
|
|
||||||
$items[$i]['tags'] = $tags;
|
|
||||||
|
|
||||||
if (preg_match('/href="(.*?)"/i', $line, $m3)) {
|
|
||||||
$items[$i]['uri'] = $m3[1];
|
|
||||||
// $items[$i]['meta'] = meta($m3[1]);
|
|
||||||
} else {
|
|
||||||
$items[$i]['uri'] = '';
|
|
||||||
// $items[$i]['meta'] = '';
|
|
||||||
}
|
|
||||||
|
|
||||||
if (preg_match('/<a(.*?)>(.*?)<\/a>/i', $line, $m4)) {
|
|
||||||
$items[$i]['title'] = $m4[2];
|
|
||||||
// $items[$i]['slug'] = slugify($m4[2]);
|
|
||||||
} else {
|
|
||||||
$items[$i]['title'] = 'untitled';
|
|
||||||
// $items[$i]['slug'] = '';
|
|
||||||
}
|
|
||||||
|
|
||||||
if (preg_match('/note="(.*?)"<\/a>/i', $line, $m5)) {
|
|
||||||
$items[$i]['note'] = $m5[1];
|
|
||||||
} elseif (preg_match('/<dd>(.*?)<\//i', $line, $m6)) {
|
|
||||||
$items[$i]['note'] = str_replace('<br>', "\n", $m6[1]);
|
|
||||||
} else {
|
|
||||||
$items[$i]['note'] = '';
|
|
||||||
}
|
|
||||||
|
|
||||||
if (preg_match('/(tags?|labels?|folders?)="(.*?)"/i', $line, $m7)) {
|
|
||||||
array_unique(array_merge($items[$i]['tags'], explode(' ', trim(preg_replace("/\s+/", " ", strtr($m7[2], ',', ' '))))));
|
|
||||||
}
|
|
||||||
if (preg_match('/add_date="(.*?)"/i', $line, $m8)) {
|
|
||||||
$items[$i]['time'] = $m8[1];
|
|
||||||
} else {
|
|
||||||
$items[$i]['time'] = time();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (preg_match('/(public|published|pub)="(.*?)"/i', $line, $m9)) {
|
|
||||||
$items[$i]['pub'] = $str_bool($m9[2], false) ? 1 : 0;
|
|
||||||
} elseif (preg_match('/(private|shared)="(.*?)"/i', $line, $m10)) {
|
|
||||||
$items[$i]['pub'] = $str_bool($m10[2], true) ? 0 : 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
$i++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ksort($items);
|
|
||||||
|
|
||||||
return $items;
|
|
||||||
}
|
|
||||||
|
|
||||||
?>
|
?>
|
||||||
|
Loading…
Reference in New Issue
Block a user