From e567e30ce0214e3ce26e12b0910773b9dc03a104 Mon Sep 17 00:00:00 2001 From: yohan <783b8c87@scimetis.net> Date: Fri, 18 Sep 2015 20:19:37 +0200 Subject: [PATCH] Separate Netscape bookmark parser and add a license header. --- .../parse_netscape_bookmarks.php | 149 ++++++++++++++++++ www/importNetscape.php | 133 +--------------- 2 files changed, 156 insertions(+), 126 deletions(-) create mode 100644 src/SemanticScuttle/parse_netscape_bookmarks.php diff --git a/src/SemanticScuttle/parse_netscape_bookmarks.php b/src/SemanticScuttle/parse_netscape_bookmarks.php new file mode 100644 index 0000000..228ce61 --- /dev/null +++ b/src/SemanticScuttle/parse_netscape_bookmarks.php @@ -0,0 +1,149 @@ +'; +var_dump(parse_netscape_bookmarks(file_get_contents('bookmarks_export.htm'))); +*/ + +function parse_netscape_bookmarks($bkmk_str, $default_tag = null) { + $i = 0; + $next = false; + $items = []; + + $current_tag = $default_tag = $default_tag ?: 'autoimported-'.date("Ymd"); + + $bkmk_str = str_replace(["\r","\n","\t"], ['','',' '], $bkmk_str); + + $bkmk_str = preg_replace_callback('@
(.*?)('.str_replace(["\r", "\n"], ['', '
'], trim($m[1])).'(\s*?)\n<", $bkmk_str); + $bkmk_str = preg_replace('/(/', "\n", $bkmk_str); + $lines = explode("\n", $bkmk_str); + + $str_bool = function($str, $default = false) { + if (!$str) { + return false; + } elseif (!is_string($str) && $str) { + return true; + } + + $true = 'y|yes|on|checked|ok|1|true|array|\+|okay|yes+|t|one'; + $false = 'n|no|off|empty|null|false|0|-|exit|die|neg|f|zero|void'; + + if (preg_match("/^($true)$/i", $str)) { + return true; + } elseif (preg_match("/^($false)$/i", $str)) { + return false; + } + + return $default; + }; + $tags = array($default_tag); + foreach ($lines as $line_no => $line) { + /* If we match a tag, set current tag to that, if
, stop tag. */ + if (preg_match('/^(.*?)<\/h\d>/i', $line, $m1)) { + $current_tag = trim(preg_replace("/\s+/", "_", strtr($m1[2], ', /+', '____'))); + $tags[] = $current_tag; + continue; + } elseif (preg_match('/^<\/DL>/i', $line)) { + $current_tag = $default_tag; + array_pop($tags); + } + + if (preg_match('/(.*?)<\/a>/i', $line, $m4)) { + $items[$i]['title'] = $m4[2]; + // $items[$i]['slug'] = slugify($m4[2]); + } else { + $items[$i]['title'] = 'untitled'; + // $items[$i]['slug'] = ''; + } + + if (preg_match('/note="(.*?)"<\/a>/i', $line, $m5)) { + $items[$i]['note'] = $m5[1]; + } elseif (preg_match('/
(.*?)<\//i', $line, $m6)) { + $items[$i]['note'] = str_replace('
', "\n", $m6[1]); + } else { + $items[$i]['note'] = ''; + } + + if (preg_match('/(tags?|labels?|folders?)="(.*?)"/i', $line, $m7)) { + array_unique(array_merge($items[$i]['tags'], explode(' ', trim(preg_replace("/\s+/", " ", strtr($m7[2], ',', ' ')))))); + } + if (preg_match('/add_date="(.*?)"/i', $line, $m8)) { + $items[$i]['time'] = $m8[1]; + } else { + $items[$i]['time'] = time(); + } + + if (preg_match('/(public|published|pub)="(.*?)"/i', $line, $m9)) { + $items[$i]['pub'] = $str_bool($m9[2], false) ? 1 : 0; + } elseif (preg_match('/(private|shared)="(.*?)"/i', $line, $m10)) { + $items[$i]['pub'] = $str_bool($m10[2], true) ? 0 : 1; + } + + $i++; + } + } + ksort($items); + + return $items; +} + +?> diff --git a/www/importNetscape.php b/www/importNetscape.php index 89d75b7..b0b7b37 100644 --- a/www/importNetscape.php +++ b/www/importNetscape.php @@ -20,6 +20,13 @@ ***************************************************************************/ require_once 'www-header.php'; +if ('@data_dir@' == '@' . 'data_dir@') { + //non pear-install + require_once dirname(__FILE__) . '/../src/SemanticScuttle/parse_netscape_bookmarks.php'; +} else { + //pear installation; files are in include path + require_once 'SemanticScuttle/parse_netscape_bookmarks.php'; +} /* Service creation: only useful services are created */ $bookmarkservice =SemanticScuttle_Service_Factory::get('Bookmark'); @@ -101,130 +108,4 @@ if ($userservice->isLoggedOn() && sizeof($_FILES) > 0 && $_FILES['userfile']['si $templateservice->loadTemplate($templatename, $tplVars); } - -/** - * Basically netscape bookmark files often come so badly formed, there's - * no reliable way I could find to parse them with DOM or SimpleXML, - * even after running HTML Tidy on them. So, this function does a bunch of - * transformations on the general format of a netscape bookmark file, to get - * Each bookmark and its description onto one line, and goes through line by - * line, matching tags and attributes. It's messy, but it works better than - * anything I could find in hours of googling, and anything that I could - * write after hours with DOM and SimpleXML. I didn't want to pull in a big - * DOM parsing library just to do this one thing, so this is it. - * @todo - running Tidy before doing this might be beneficial. - * ?? $bkmk_str = tidy_parse_string($bkmk_str)->cleanRepair(); - * - * Update 2013-07-08: - * Just tested this on an export of some bookmarks from Pinboard.in - * and it seems that it is still working, so good for me. - */ - -/* -print '
';
-var_dump(parse_netscape_bookmarks(file_get_contents('bookmarks_export.htm')));
-*/
-
-function parse_netscape_bookmarks($bkmk_str, $default_tag = null) {
-    $i = 0;
-    $next = false;
-    $items = [];
-
-    $current_tag = $default_tag = $default_tag ?: 'autoimported-'.date("Ymd");
-
-    $bkmk_str = str_replace(["\r","\n","\t"], ['','',' '], $bkmk_str);
-
-    $bkmk_str = preg_replace_callback('@
(.*?)('.str_replace(["\r", "\n"], ['', '
'], trim($m[1])).'(\s*?)\n<", $bkmk_str); - $bkmk_str = preg_replace('/(/', "\n
", $bkmk_str); - $lines = explode("\n", $bkmk_str); - - $str_bool = function($str, $default = false) { - if (!$str) { - return false; - } elseif (!is_string($str) && $str) { - return true; - } - - $true = 'y|yes|on|checked|ok|1|true|array|\+|okay|yes+|t|one'; - $false = 'n|no|off|empty|null|false|0|-|exit|die|neg|f|zero|void'; - - if (preg_match("/^($true)$/i", $str)) { - return true; - } elseif (preg_match("/^($false)$/i", $str)) { - return false; - } - - return $default; - }; - $tags = array($default_tag); - foreach ($lines as $line_no => $line) { - /* If we match a tag, set current tag to that, if
, stop tag. */ - if (preg_match('/^(.*?)<\/h\d>/i', $line, $m1)) { - $current_tag = trim(preg_replace("/\s+/", "_", strtr($m1[2], ', /+', '____'))); - $tags[] = $current_tag; - continue; - } elseif (preg_match('/^<\/DL>/i', $line)) { - $current_tag = $default_tag; - array_pop($tags); - } - - if (preg_match('/(.*?)<\/a>/i', $line, $m4)) { - $items[$i]['title'] = $m4[2]; - // $items[$i]['slug'] = slugify($m4[2]); - } else { - $items[$i]['title'] = 'untitled'; - // $items[$i]['slug'] = ''; - } - - if (preg_match('/note="(.*?)"<\/a>/i', $line, $m5)) { - $items[$i]['note'] = $m5[1]; - } elseif (preg_match('/
(.*?)<\//i', $line, $m6)) { - $items[$i]['note'] = str_replace('
', "\n", $m6[1]); - } else { - $items[$i]['note'] = ''; - } - - if (preg_match('/(tags?|labels?|folders?)="(.*?)"/i', $line, $m7)) { - array_unique(array_merge($items[$i]['tags'], explode(' ', trim(preg_replace("/\s+/", " ", strtr($m7[2], ',', ' ')))))); - } - if (preg_match('/add_date="(.*?)"/i', $line, $m8)) { - $items[$i]['time'] = $m8[1]; - } else { - $items[$i]['time'] = time(); - } - - if (preg_match('/(public|published|pub)="(.*?)"/i', $line, $m9)) { - $items[$i]['pub'] = $str_bool($m9[2], false) ? 1 : 0; - } elseif (preg_match('/(private|shared)="(.*?)"/i', $line, $m10)) { - $items[$i]['pub'] = $str_bool($m10[2], true) ? 0 : 1; - } - - $i++; - } - } - ksort($items); - - return $items; -} - ?>