Re-implement URL normalizer based on the Python implementation

This commit is contained in:
Marcel Klehr 2018-11-28 15:54:27 +01:00
parent 281a07979e
commit 3d9c8d256d
3 changed files with 1064 additions and 909 deletions

View File

@ -6,7 +6,6 @@
"pguardiario/phpuri": "1.0.*",
"psr/http-message": "^1.0",
"psr/http-factory": "^1.0",
"psr/http-client": "^0.2.0",
"webignition/url": "^2.2"
"psr/http-client": "^0.2.0"
}
}

1692
composer.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,11 +1,30 @@
<?php
namespace OCA\Bookmarks\Controller\Lib;
use \webignition\NormalisedUrl\NormalisedUrl;
class UrlNormalizer {
private $normalizer;
// Schemes that are written with a '://' separator by construct() and whose
// path is passed through normalize_path() by normalize().
const SCHEMES = ['http', 'https', 'ftp', 'sftp', 'file', 'gopher', 'imap', 'mms',
'news', 'nntp', 'telnet', 'prospero', 'rsync', 'rtsp', 'rtspu',
'svn', 'git', 'ws', 'wss'];
// Characters accepted in a scheme name by split() (ASCII letters only).
const SCHEME_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';
// NOTE(review): not referenced by any method visible in this class.
const IP_CHARS = '0123456789.:';
// Scheme => default port; normalize_port() compares the port against this map.
const DEFAULT_PORT = [
'http'=> '80',
'https'=> '443',
'ws'=> '80',
'wss'=> '443',
'ftp'=> '21',
'sftp'=> '22',
'ldap'=> '389'
];
// Per URL component: the characters that unquote() must leave percent-encoded.
const QUOTE_EXCEPTIONS = [
'path'=> ' /?+#',
'query'=> ' &=+#',
'fragment'=> ' +#'
];
/**
 * Instantiates a NormalisedUrl (imported from webignition\NormalisedUrl) and
 * stores it in $this->normalizer.
 */
public function __construct() {
$this->normalizer = new NormalisedUrl();
}
@ -16,7 +35,258 @@ class UrlNormalizer {
* @return string Normalized url;
*/
public function normalize($url) {
    // The former delegation to $this->normalizer returned before any of the
    // logic below could run; it is dropped so that this implementation is used.
    $url = trim((string)$url);
    if ($url === '') {
        return '';
    }

    // split() returns a positional list, not an associative array.
    list($scheme, $netloc, $path, $query, $fragment) = self::split($url);

    if ($scheme !== '') {
        // Only well-known hierarchical schemes get their path normalized;
        // for anything else the path is opaque and kept verbatim.
        if (in_array($scheme, self::SCHEMES, true)) {
            $path = self::normalize_path($path);
        }
    } else {
        // url is relative, netloc (if present) is part of path
        $netloc = $path;
        $path = '';
        $slash = strpos($netloc, '/');
        if ($slash !== false) {
            // Extract the path first: it is cut out of the same string that
            // is about to be shortened to the authority part.
            $path = self::normalize_path(substr($netloc, $slash));
            $netloc = substr($netloc, 0, $slash);
        }
    }

    list($username, $password, $host, $port) = self::split_netloc($netloc);

    return self::construct([
        'scheme' => $scheme,
        'username' => $username,
        'password' => $password,
        'host' => self::normalize_host($host),
        'port' => self::normalize_port($scheme, $port),
        'path' => $path,
        'query' => self::normalize_query($query),
        'fragment' => self::normalize_fragment($fragment),
    ]);
}
/**
 * Assembles a URL string from its components.
 *
 * @param array $parts Map with the keys scheme, username, password, host,
 *                     port, path, query and fragment. Missing or null
 *                     entries are treated as empty.
 * @return string The assembled URL
 */
public static function construct($parts) {
    // Components are compared against '' instead of being tested for
    // truthiness, because '0' is a legitimate value (e.g. the fragment "#0").
    $part = function ($key) use ($parts) {
        return isset($parts[$key]) ? (string)$parts[$key] : '';
    };

    $url = '';

    $scheme = $part('scheme');
    if ($scheme !== '') {
        // Known hierarchical schemes use '://', everything else (mailto:, ...) ':'.
        $url .= $scheme . (in_array($scheme, self::SCHEMES, true) ? '://' : ':');
    }

    $username = $part('username');
    $password = $part('password');
    if ($username !== '') {
        $url .= $username;
        if ($password !== '') {
            $url .= ':' . $password;
        }
        $url .= '@';
    }

    $url .= $part('host');

    if ($part('port') !== '') {
        $url .= ':' . $part('port');
    }

    $url .= $part('path');

    if ($part('query') !== '') {
        $url .= '?' . $part('query');
    }
    if ($part('fragment') !== '') {
        $url .= '#' . $part('fragment');
    }

    return $url;
}
/**
 * Converts an internationalized host name to its ASCII (punycode) form.
 *
 * Hosts that consist of ASCII bytes only are returned unchanged; that covers
 * names that are already punycode ("xn--...") as well as IP literals.
 *
 * @param string $host Host name
 * @return string ASCII host name, or the input if it cannot be converted
 */
public static function normalize_host($host) {
    $host = (string)$host;

    // Nothing to convert when there are no non-ASCII bytes.
    if ($host === '' || !preg_match('/[^\x00-\x7F]/', $host)) {
        return $host;
    }

    // idn_to_ascii() is provided by the intl extension, which may be absent.
    if (!function_exists('idn_to_ascii')) {
        return $host;
    }

    // Request UTS #46 explicitly where available: the old default variant
    // (IDNA 2003) is deprecated.
    $ascii = defined('INTL_IDNA_VARIANT_UTS46')
        ? idn_to_ascii($host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46)
        : idn_to_ascii($host);

    // idn_to_ascii() returns false on failure; keep the original host then.
    return is_string($ascii) && $ascii !== '' ? $ascii : $host;
}
/**
 * Drops the port when it is the default port of the given scheme.
 *
 * @param string $scheme Lower-case scheme, may be empty
 * @param string $port Port as found in the URL, may be empty
 * @return string The port, or '' if it is absent or the scheme's default
 */
public static function normalize_port($scheme, $port) {
    // Without a scheme there is no default to compare against.
    if ($scheme === null || $scheme === '') {
        return $port;
    }

    $port = (string)$port;
    if ($port === '') {
        return '';
    }

    // Look the scheme up defensively: not every scheme has a default port.
    $defaults = self::DEFAULT_PORT;
    if (isset($defaults[$scheme])) {
        $default = $defaults[$scheme];
        // Compare numerically so that e.g. "080" still matches "80".
        if ($port === $default || (ctype_digit($port) && (int)$port === (int)$default)) {
            return '';
        }
    }

    return $port;
}
/**
 * Normalizes a URL path: decodes unnecessary percent-escapes and resolves
 * '.', '..' and duplicate slashes.
 *
 * @param string $path Raw path
 * @return string Normalized path; '/' for an empty path
 */
public static function normalize_path($path) {
    $path = (string)$path;
    if (in_array($path, ['//', '/', ''], true)) {
        return '/';
    }

    $npath = self::get_absolute_path(self::unquote($path, self::QUOTE_EXCEPTIONS['path']));

    // Resolving the segments must not turn an absolute path into a relative
    // one, so the leading slash is (re-)established here.
    if ($path[0] === '/') {
        $npath = '/' . ltrim($npath, '/');
    }

    // Keep a trailing slash: "/a/" and "/a" may be different resources.
    if (substr($path, -1) === '/' && $npath !== '/') {
        $npath .= '/';
    }

    return $npath;
}
/**
 * Collapses a slash-separated path: empty segments and '.' are skipped and
 * '..' removes the segment collected before it.
 *
 * @param string $path Path to collapse
 * @return string Remaining segments joined with '/', without a leading or
 *                trailing slash
 */
public static function get_absolute_path($path) {
    $stack = [];
    foreach (explode('/', $path) as $segment) {
        // Empty segments stem from leading, trailing or doubled slashes.
        if (strlen($segment) === 0 || $segment === '.') {
            continue;
        }
        if ($segment === '..') {
            // Step one level up; a no-op when already at the top.
            array_pop($stack);
            continue;
        }
        $stack[] = $segment;
    }
    return implode('/', $stack);
}
/**
 * Normalizes a query string: decodes unnecessary percent-escapes, drops
 * parameters that have no '=' and sorts the remaining ones.
 *
 * @param string $query Query string without the leading '?'
 * @return string Normalized query string, '' if nothing remains
 */
public static function normalize_query($query) {
    $query = (string)$query;
    // A query of up to two bytes cannot hold a meaningful key=value pair.
    // (The length has to be taken with strlen(); count() does not work on strings.)
    if (strlen($query) <= 2) {
        return '';
    }

    $nquery = self::unquote($query, self::QUOTE_EXCEPTIONS['query']);

    $nparams = [];
    foreach (explode('&', $nquery) as $param) {
        if (strpos($param, '=') !== false) {
            $nparams[] = $param;
        }
    }

    // Byte-wise ordering, independent of how numeric-looking strings compare.
    sort($nparams, SORT_STRING);

    return implode('&', $nparams);
}
/**
 * Passes the fragment through unquote(), using the 'fragment' entry of
 * QUOTE_EXCEPTIONS as the exception list.
 *
 * @param string $fragment Fragment without the leading '#'
 * @return string The result of unquote()
 */
public static function normalize_fragment($fragment) {
    $keep_encoded = self::QUOTE_EXCEPTIONS['fragment'];
    $normalized = self::unquote($fragment, $keep_encoded);
    return $normalized;
}
/**
 * Decodes percent-escapes in $text, except those that stand for one of the
 * characters listed in $exceptions.
 *
 * Malformed escapes (a '%' not followed by two hex digits) are kept as they are.
 *
 * @param string $text Text to decode
 * @param string|array $exceptions Characters that must stay percent-encoded,
 *                                 as a string or as a list of characters
 * @return string The decoded text
 */
public static function unquote($text, $exceptions=[]) {
    // Empty input is handed back untouched (null included, which is how the
    // previous length check behaved as well).
    if ($text === null || $text === '') {
        return $text;
    }
    $text = (string)$text;
    if (strpos($text, '%') === false) {
        return $text;
    }

    // Accept both forms of the exception list; the default is an array.
    $keep = is_array($exceptions) ? implode('', $exceptions) : (string)$exceptions;

    $chunks = explode('%', $text);
    // Everything before the first '%' is literal text.
    $res = array_shift($chunks);

    foreach ($chunks as $chunk) {
        $hex = (string)substr($chunk, 0, 2);
        if (strlen($hex) === 2 && ctype_xdigit($hex)) {
            $char = chr(hexdec($hex));
            // strpos() !== false rather than truthiness: "%30" decodes to "0".
            if (strpos($keep, $char) === false) {
                $res .= $char . (string)substr($chunk, 2);
                continue;
            }
        }
        // Protected character or malformed escape: restore the chunk verbatim.
        $res .= '%' . $chunk;
    }

    return $res;
}
/**
 * Splits a URL into scheme, netloc, path, query and fragment.
 *
 * Without a recognizable scheme the authority cannot be told apart from the
 * path, so everything up to the query/fragment is returned as path and
 * netloc stays empty.
 *
 * @param string $url URL to split
 * @return array List of [scheme, netloc, path, query, fragment]; absent
 *               components are ''
 */
public static function split($url) {
    $url = (string)$url;
    $scheme = $netloc = $path = $query = $fragment = '';
    $rest = $url;

    $ip6_start = strpos($url, '[');
    $scheme_end = strpos($url, ':');
    // A ':' that follows a '[' belongs to an IPv6 literal, not to a scheme.
    if ($ip6_start !== false && $scheme_end !== false && $ip6_start < $scheme_end) {
        $scheme_end = false;
    }
    if ($scheme_end !== false && $scheme_end > 0) {
        $candidate = substr($url, 0, $scheme_end);
        // The scheme is accepted only if *every* character is a scheme
        // character; "example.com:8080" therefore has no scheme.
        if (strspn($candidate, self::SCHEME_CHARS) === strlen($candidate)) {
            $scheme = strtolower($candidate);
            $rest = ltrim(substr($url, $scheme_end), ':/');
        }
    }

    // Peel the components off from the right: the fragment starts at the
    // first '#', the query at the first '?' before it. A '?' inside the
    // fragment or a '/' inside the query is therefore never misread.
    $frag_pos = strpos($rest, '#');
    if ($frag_pos !== false) {
        $fragment = (string)substr($rest, $frag_pos + 1);
        $rest = (string)substr($rest, 0, $frag_pos);
    }

    $query_pos = strpos($rest, '?');
    if ($query_pos !== false) {
        $query = (string)substr($rest, $query_pos + 1);
        $rest = (string)substr($rest, 0, $query_pos);
    }

    if ($scheme === '') {
        $path = $rest;
    } else {
        $path_pos = strpos($rest, '/');
        if ($path_pos !== false) {
            $netloc = (string)substr($rest, 0, $path_pos);
            $path = (string)substr($rest, $path_pos);
        } else {
            $netloc = $rest;
        }
    }

    return [$scheme, $netloc, $path, $query, $fragment];
}
/**
 * Strips trailing '.' and ':' characters from the netloc and lower-cases it.
 *
 * @param string $netloc Network location
 * @return string Cleaned network location
 */
public static function _clean_netloc($netloc) {
    $trimmed = rtrim($netloc, '.:');
    return strtolower($trimmed);
}
/**
 * Splits a network location of the form [user[:password]@]host[:port].
 *
 * @param string $netloc Network location
 * @return array List of [username, password, host, port]; absent parts are ''
 */
public static function split_netloc($netloc) {
    $netloc = (string)$netloc;
    $username = $password = $host = $port = '';

    $at = strpos($netloc, '@');
    if ($at !== false) {
        $user_pw = substr($netloc, 0, $at);
        $netloc = (string)substr($netloc, $at + 1);

        $colon = strpos($user_pw, ':');
        if ($colon !== false) {
            $username = substr($user_pw, 0, $colon);
            $password = (string)substr($user_pw, $colon + 1);
        } else {
            $username = $user_pw;
        }
    }

    $netloc = self::_clean_netloc($netloc);

    // The port is what follows the *last* colon, since an IPv6 literal
    // contains colons itself. A netloc ending in ']' is a bare IPv6 literal
    // without a port.
    $colon = strrpos($netloc, ':');
    if ($colon !== false && substr($netloc, -1) !== ']') {
        $host = substr($netloc, 0, $colon);
        $port = (string)substr($netloc, $colon + 1);
    } else {
        $host = $netloc;
    }

    return [$username, $password, $host, $port];
}
}