Reapply breaking changes

of switching to WHATWG URL normalizer
This commit is contained in:
Marcel Klehr 2019-08-30 22:37:28 +02:00
parent 422ffb95bd
commit 2d9d2cdb24
5 changed files with 15 additions and 417 deletions

View File

@ -6,6 +6,7 @@
"pguardiario/phpuri": "1.0.*",
"psr/http-message": "^1.0",
"psr/http-factory": "^1.0",
"psr/http-client": "^0.2.0"
"psr/http-client": "^0.2.0",
"rowbot/url": "^2.0"
}
}

View File

@ -1,341 +1,15 @@
<?php
namespace OCA\Bookmarks;
use Rowbot\URL\URL;
use Rowbot\URL\Exception\TypeError;
class UrlNormalizer {
// NOTE(review): not assigned or read by any method visible in this class.
private $normalizer;
// Sentinel stored in the 'scheme' slot by split() when a URL starts with '//'
// (no explicit scheme); construct() turns it back into a bare '//' prefix.
const DEFAULT_SCHEME = 42;
// Schemes for which _normalize() normalizes the path and construct() emits
// 'scheme://'; any other scheme is emitted as 'scheme:' with its path untouched.
const SCHEMES = ['http', 'https', 'ftp', 'sftp', 'file', 'gopher', 'imap', 'mms',
'news', 'nntp', 'telnet', 'prospero', 'rsync', 'rtsp', 'rtspu',
'svn', 'git', 'ws', 'wss'];
// Characters split() accepts when it inspects a scheme candidate.
const SCHEME_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';
// NOTE(review): not referenced by any method visible in this class.
const IP_CHARS = '0123456789.:';
// Port that normalize_port() removes when it is given explicitly for the scheme.
const DEFAULT_PORT = [
'http'=> '80',
'https'=> '443',
'ws'=> '80',
'wss'=> '443',
'ftp'=> '21',
'sftp'=> '22',
'ldap'=> '389'
];
// Per URL component: extra characters (beyond ASCII letters and digits) that
// unquote() is allowed to leave or decode as literal characters.
const QUOTE_EXCEPTIONS = [
'path'=> ' /?+#~',
'query'=> ' &=+#',
'fragment'=> ' +#/'
];
/**
 * Create a normalizer instance. No state is initialised here.
 */
public function __construct()
{
}
/**
 * Normalize a URL on a best-effort basis.
 *
 * @param string $url URL to normalize
 * @return string The normalized URL, or $url unchanged when _normalize()
 *                throws an \Exception
 */
public function normalize($url) {
    try {
        $normalized = self::_normalize($url);
    } catch (\Exception $failure) {
        // Deliberate fallback: an input that cannot be normalized is handed back as given.
        return $url;
    }
    return $normalized;
}
/**
 * Normalize a URL: split it, normalize each component, and reassemble it.
 *
 * Unlike normalize(), this lets exceptions raised while un-quoting
 * components propagate to the caller.
 *
 * @param string $url URL to normalize (surrounding whitespace is trimmed)
 * @return string Normalized URL; '' when the trimmed input is empty
 */
public static function _normalize($url) {
    $url = trim($url);
    if ($url === '') {
        return '';
    }

    $parts = self::split($url);
    $hasScheme = (isset($parts['scheme']) && strlen($parts['scheme']) > 0)
        || $parts['scheme'] === self::DEFAULT_SCHEME;

    if ($hasScheme) {
        $netloc = $parts['netloc'];
        // Only paths of known schemes are rewritten; others are kept verbatim.
        $path = in_array($parts['scheme'], self::SCHEMES)
            ? self::normalize_path($parts['path'])
            : $parts['path'];
    } else {
        // No scheme: split() returned host and path together in 'path', so the
        // text before the first '/' is the netloc and the rest is the path.
        $netloc = $parts['path'];
        $path = '';
        $slashAt = strpos($netloc, '/');
        if ($slashAt !== false) {
            $hostPart = substr($netloc, 0, $slashAt);
            $pathPart = substr($netloc, $slashAt + 1);
            $netloc = $hostPart;
            $path = self::normalize_path('/' . $pathPart);
        }
    }

    list($username, $password, $host, $port) = self::split_netloc($netloc);
    $host = self::normalize_host($host);
    $port = self::normalize_port($parts['scheme'], $port);
    $query = self::normalize_query($parts['query']);
    $fragment = self::normalize_fragment($parts['fragment']);

    return self::construct([
        'scheme' => $parts['scheme'],
        'username' => $username,
        'password' => $password,
        'host' => $host,
        'port' => $port,
        'path' => $path,
        'query' => $query,
        'fragment' => $fragment
    ]);
}
/**
 * Assemble a URL string from its components.
 *
 * @param array $parts Array with keys 'scheme', 'username', 'password',
 *                     'host', 'port', 'path', 'query' and 'fragment'
 * @return string The assembled URL
 */
public static function construct($parts) {
    // Scheme prefix: '//' for the scheme-less sentinel, 'scheme://' for known
    // schemes, 'scheme:' for everything else, nothing when there is no scheme.
    if ($parts['scheme'] === self::DEFAULT_SCHEME) {
        $prefix = '//';
    } elseif (strlen($parts['scheme']) > 0) {
        $separator = in_array($parts['scheme'], self::SCHEMES) ? '://' : ':';
        $prefix = $parts['scheme'] . $separator;
    } else {
        $prefix = '';
    }

    // Credentials are emitted only when a username exists; a password alone is dropped.
    $credentials = '';
    if (strlen($parts['username']) > 0) {
        $credentials = $parts['username'];
        if (strlen($parts['password']) > 0) {
            $credentials .= ':' . $parts['password'];
        }
        $credentials .= '@';
    }

    $result = $prefix . $credentials . $parts['host'];

    if (strlen($parts['port']) > 0) {
        $result .= ':' . $parts['port'];
    }

    $hasQuery = strlen($parts['query']) > 0;
    $hasFragment = strlen($parts['fragment']) > 0;

    // A path of at most one character is omitted unless a query or fragment follows it.
    if (strlen($parts['path']) > 1 || $hasQuery || $hasFragment) {
        $result .= $parts['path'];
    }
    if ($hasQuery) {
        $result .= '?' . $parts['query'];
    }
    if ($hasFragment) {
        $result .= '#' . $parts['fragment'];
    }

    return $result;
}
/**
 * Convert an internationalized host name to its ASCII (punycode) form.
 *
 * Hosts that already contain an 'xn--' label are returned untouched, as is
 * the empty host. When the IDNA conversion fails, the host is returned
 * unchanged rather than the boolean false reported by idn_to_ascii().
 *
 * @param string $host Host name to convert
 * @return string ASCII host name, or $host itself when no conversion is
 *                needed or possible
 */
public static function normalize_host($host) {
    // Nothing to convert for an empty host; skip the IDNA call entirely.
    if ($host === '') {
        return $host;
    }
    if (strpos($host, 'xn--') !== false) {
        return $host;
    }
    $ascii = idn_to_ascii($host, IDNA_NONTRANSITIONAL_TO_ASCII, INTL_IDNA_VARIANT_UTS46);
    // idn_to_ascii() signals failure with false; passing that on would make
    // construct() silently emit a URL without a host.
    if ($ascii === false) {
        return $host;
    }
    return $ascii;
}
/**
 * Drop a port that equals the default port of the scheme.
 *
 * @param string|int $scheme Scheme name, self::DEFAULT_SCHEME for a
 *                           scheme-less '//' URL, or ''/null when absent
 * @param string $port Port as found in the URL ('' when none was given)
 * @return string $port when it must be kept, '' otherwise
 */
public static function normalize_port($scheme, $port) {
    // Without a scheme there is no default to compare against.
    if (!isset($scheme) || $scheme === '') {
        return $port;
    }
    if (!isset($port) || $port === '') {
        return '';
    }
    // Scheme-less ('//') URLs keep any explicit port.
    if ($scheme === self::DEFAULT_SCHEME) {
        return $port;
    }
    // Schemes without a registered default (e.g. 'git', 'file') keep their port.
    // Checking the key first avoids an undefined-index notice on the lookup.
    if (!array_key_exists($scheme, self::DEFAULT_PORT)) {
        return $port;
    }
    // Loose comparison is kept on purpose so numerically equal ports still match.
    if ($port != self::DEFAULT_PORT[$scheme]) {
        return $port;
    }
    return '';
}
/**
 * Normalize a URL path: un-quote it, resolve '.' and '..' segments and
 * collapse repeated slashes, keeping a trailing slash if the input had one.
 *
 * @param string $path Raw path component
 * @return string Normalized path; '/' for '', '/' and '//'
 */
public static function normalize_path($path) {
    if (in_array($path, ['//', '/', ''])) {
        return '/';
    }

    $decoded = self::unquote($path, self::QUOTE_EXCEPTIONS['path']);
    $resolved = self::get_absolute_path($decoded);

    // get_absolute_path() never ends in '/', so restore the one the input had.
    $hadTrailingSlash = substr($path, strlen($path) - 1, 1) === '/';
    if ($hadTrailingSlash && $resolved != '/') {
        $resolved .= '/';
    }

    return $resolved;
}
/**
 * Resolve '.' and '..' segments and drop empty segments of a path.
 *
 * @param string $path Path with '/'-separated segments
 * @return string Path starting with '/' and without a trailing slash
 *                ('/' alone when no segment remains)
 */
public static function get_absolute_path($path) {
    $stack = [];
    foreach (explode('/', $path) as $segment) {
        // Empty segments come from leading, trailing or doubled slashes.
        if ($segment === '' || $segment === '.') {
            continue;
        }
        if ($segment === '..') {
            // Popping an empty stack is a no-op, so '..' cannot climb above the root.
            array_pop($stack);
            continue;
        }
        $stack[] = $segment;
    }
    return '/' . implode('/', $stack);
}
/**
 * Normalize a query string: un-quote it and sort its '&'-separated parameters.
 *
 * A query that contains ';' but no '&' is returned un-quoted without sorting.
 * NOTE(review): queries of at most two characters are discarded entirely.
 *
 * @param string $query Raw query component (without the leading '?')
 * @return string Normalized query, possibly ''
 */
public static function normalize_query($query) {
    if (strlen($query) <= 2) {
        return '';
    }

    $decoded = self::unquote($query, self::QUOTE_EXCEPTIONS['query']);

    $hasSemicolon = strpos($decoded, ';') !== false;
    $hasAmpersand = strpos($decoded, '&') !== false;
    if ($hasSemicolon && !$hasAmpersand) {
        return $decoded;
    }

    $pairs = explode('&', $decoded);
    sort($pairs);
    return implode('&', $pairs);
}
/**
 * Normalize a fragment by un-quoting it with the fragment exception set.
 *
 * @param string $fragment Raw fragment component (without the leading '#')
 * @return string Normalized fragment
 */
public static function normalize_fragment($fragment) {
    $literalChars = self::QUOTE_EXCEPTIONS['fragment'];
    return self::unquote($fragment, $literalChars);
}
/**
 * Bring the percent-encoding of a URL component into a canonical form.
 *
 * Literal characters are percent-encoded when they are non-ASCII bytes,
 * control characters or space, or anything that is neither an ASCII
 * letter/digit nor listed in $exceptions (the characters rawurlencode()
 * leaves alone, plus ! * ' ( ), stay literal). Existing %XX escapes are
 * decoded only when they stand for an ASCII letter/digit or a character in
 * $exceptions; all other escapes are copied through verbatim.
 *
 * @param string $text Component text to process
 * @param string|array $exceptions Characters that may appear literally,
 *        either as one string or as an array of single characters
 * @return string Re-quoted text
 * @throws \Exception with message 'URIError' on a truncated or non-hex
 *         escape or an invalid multi-byte lead byte
 */
public static function unquote($text, $exceptions=[]) {
    // The default is an array, but the lookups below use strpos(), which
    // needs a string haystack; flatten arrays so both forms are accepted.
    if (is_array($exceptions)) {
        $exceptions = implode('', $exceptions);
    }
    $r = '';
    $k = 0;
    $length = strlen($text);
    while ($k < $length) {
        $c = substr($text, $k, 1);
        if ($c !== '%') {
            if (ord($c) >= 128 || ord($c) <= 32 || preg_match('/[a-zA-Z0-9]/', $c) == false && strpos($exceptions, $c) === false) {
                // Encode, but keep the sub-delimiters that rawurlencode() escapes.
                $revert = ['%21'=>'!', '%2A'=>'*', '%27'=>"'", '%28'=>'(', '%29'=>')'];
                $s = strtr(rawurlencode($c), $revert);
            } else {
                $s = $c;
            }
        } else {
            $start = $k;
            // A '%' must be followed by two hex digits.
            if ($k + 2 >= $length) {
                throw new \Exception('URIError');
            }
            if (preg_match('/[0-9a-fA-F]/', substr($text, $k + 1, 1)) == false || preg_match('/[0-9a-fA-F]/', substr($text, $k + 2, 1)) == false) {
                throw new \Exception('URIError');
            }
            $b = hexdec(substr($text, $k + 1, 2));
            // $k now points at the last hex digit of this escape.
            $k += 2;
            if ($b <= 32) {
                // Control characters and space stay escaped exactly as written.
                $s = substr($text, $start, $k - $start +1);
            } elseif (($b & (1 << 7)) == 0) {
                // Other ASCII: decode only letters, digits and allowed characters.
                $c = chr($b);
                if (preg_match('/[a-zA-Z0-9]/', $c) == false && strpos($exceptions, $c) === false) {
                    $s = substr($text, $start, $k - $start +1);
                } else {
                    $s = $c;
                }
            } else {
                // High bit set: count the leading 1 bits to get the length of
                // the multi-byte sequence this escape starts.
                $n = 0;
                while ((($b << $n) & 0x80) !== 0) {
                    $n++;
                }
                if ($n === 1 || $n > 4) {
                    throw new \Exception('URIError');
                }
                if ($k + 3 * ($n -1) > $length) {
                    throw new \Exception('URIError');
                }
                // Consume the following $n - 1 escapes; they only need to be
                // well-formed %XX and are kept verbatim with the lead byte.
                $j = 1;
                while ($j < $n) {
                    $k++;
                    if (substr($text, $k, 1) !== '%') {
                        throw new \Exception('URIError');
                    }
                    if (preg_match('/[0-9a-fA-F]/', substr($text, $k+1, 1)) == false || preg_match('/[0-9a-fA-F]/', substr($text, $k+2, 1)) == false) {
                        throw new \Exception('URIError');
                    }
                    $k += 2;
                    $j++;
                }
                $s = substr($text, $start, $k - $start +1);
            }
        }
        $r .= $s;
        $k++;
    }
    return $r;
}
/**
 * Split a URL string into scheme, netloc, path, query and fragment.
 *
 * A leading '//' yields the scheme self::DEFAULT_SCHEME. Otherwise the text
 * before the first ':' is used as the (lower-cased) scheme, unless a '['
 * occurs before that ':'. When no scheme is found, 'netloc' is returned
 * empty and its text is prepended to 'path' instead.
 *
 * @param string $url URL to split
 * @return array Array with keys 'scheme', 'netloc', 'path', 'query' and
 *               'fragment'; parts that are absent are ''
 */
public static function split($url) {
$scheme = $netloc = $path = $query = $fragment = '';
$ip6_start = strpos($url, '[');
$scheme_end = strpos($url, ':');
// A '[' ahead of the first ':' disables scheme detection for that colon.
if ($ip6_start !== false && $scheme_end !== false && $ip6_start < $scheme_end) {
$scheme_end = -1;
}
if (substr($url, 0, 2) === '//') {
$scheme = self::DEFAULT_SCHEME;
$rest = substr($url, 2);
}
if ($scheme === '' && $scheme_end > 0) {
// NOTE(review): the scheme is assigned as soon as the first character is a
// letter; a later non-letter only stops the loop and does not undo it.
for ($i = 0; $i < $scheme_end; $i++) {
$c = $url[$i];
if (strpos(self::SCHEME_CHARS, $c) === false) {
break;
} else {
$scheme = strtolower(substr($url, 0, $scheme_end));
// Strips the ':' and every ':' or '/' that directly follows it.
$rest = ltrim(substr($url, $scheme_end), ':/');
}
}
}
if ($scheme === '') {
$rest = $url;
}
// Positions of the first '/', '?' and '#'. The '> 0' tests below treat both
// "not found" (false) and "at offset 0" as absent.
$l_path = strpos($rest, '/');
$l_query = strpos($rest, '?');
$l_frag = strpos($rest, '#');
// First branch: a '/' exists and comes before a '?' or '#', or there is
// neither '?' nor '#'. The text before that '/' is the netloc, except in
// the inner case where '?' precedes '/', which cuts the netloc at '?'.
if ($l_path > 0 && (($l_frag > $l_path && $l_frag > 0) || ($l_query > $l_path && $l_query > 0) || $l_query === false && $l_frag === false)) {
if ($l_query > 0 && $l_frag > 0) {
$netloc = substr($rest, 0, $l_path);
$path = substr($rest, $l_path, min($l_query, $l_frag)-$l_path);
} elseif ($l_query > 0) {
if ($l_query > $l_path) {
$netloc = substr($rest, 0, $l_path);
$path = substr($rest, $l_path, $l_query-$l_path);
} else {
$netloc = substr($rest, 0, $l_query);
$path = '';
}
} elseif ($l_frag > 0) {
$netloc = substr($rest, 0, $l_path);
$path = substr($rest, $l_path, $l_frag-$l_path);
} else {
$netloc = substr($rest, 0, $l_path);
$path = substr($rest, $l_path);
}
} else {
// No usable path: the netloc runs up to the query or fragment, or to the end.
if ($l_query > 0 && ($l_frag > $l_query || $l_frag === false)) {
$netloc = substr($rest, 0, $l_query);
} elseif ($l_frag > 0) {
$netloc = substr($rest, 0, $l_frag);
} else {
$netloc = $rest;
}
}
// The query is only extracted when its '?' comes before any '#'.
if ($l_query > 0 && ($l_frag > $l_query || $l_frag === false)) {
if ($l_frag > 0) {
$query = substr($rest, $l_query+1, $l_frag-($l_query+1));
} else {
$query = substr($rest, $l_query+1);
}
}
if ($l_frag > 0) {
$fragment = substr($rest, $l_frag+1);
}
// Without a scheme, host and path are returned together in 'path'.
if ($scheme === '') {
$path = $netloc . $path;
$netloc = '';
}
return ['scheme' => $scheme, 'netloc'=> $netloc, 'path' => $path, 'query'=>$query, 'fragment' => $fragment];
}
/**
 * Lower-case a netloc and strip trailing '.' and ':' characters.
 *
 * @param string $netloc Host (and optional port) part of a URL
 * @return string Cleaned netloc
 */
public static function _clean_netloc($netloc) {
    $withoutTrailingSeparators = rtrim($netloc, '.:');
    return strtolower($withoutTrailingSeparators);
}
/**
 * Split a netloc into user name, password, host and port.
 *
 * @param string $netloc Netloc of the form [user[:password]@]host[:port]
 * @return array List of [username, password, host, port]; missing parts are ''
 */
public static function split_netloc($netloc) {
$username = $password = $host = $port = '';
// Everything before the first '@' is the user info.
if (strpos($netloc, '@') !== false) {
$user_pw = substr($netloc, 0, strpos($netloc, '@'));
$netloc = substr($netloc, strpos($netloc, '@')+1);
if (strpos($user_pw, ':') !== false) {
$username = substr($user_pw, 0, strpos($user_pw, ':'));
$password = substr($user_pw, strpos($user_pw, ':')+1);
} else {
$username = $user_pw;
}
}
// The user info keeps its case; only the host/port remainder is cleaned.
$netloc = self::_clean_netloc($netloc);
// Host and port are separated at the first ':', unless the netloc ends in ']'.
// NOTE(review): a netloc with a '[' ... ']' part followed by ':port' would
// therefore be cut at the first ':' inside the brackets; confirm intent.
if (strpos($netloc, ':') !== false && substr($netloc, strlen($netloc)-1, 1) !== ']') {
$host = substr($netloc, 0, strpos($netloc, ':'));
$port = substr($netloc, strpos($netloc, ':')+1);
} else {
$host = $netloc;
}
return [$username, $password, $host, $port];
/**
 * Normalize a URL by parsing it with Rowbot\URL\URL and returning the
 * parsed object's href property.
 *
 * NOTE(review): nothing is caught here, so whatever the URL constructor
 * throws for input it rejects reaches the caller. TypeError is imported
 * in this file but not handled in this method; confirm that is intended.
 *
 * @param string $urlString URL to normalize
 * @return string The href of the parsed URL
 */
public function normalize($urlString) {
    return (new URL($urlString))->href;
}
}

View File

@ -81,7 +81,7 @@ class Test_BookmarkController extends TestCase {
$output = $this->controller->getSingleBookmark($this->testSubjectPublicBmId);
$data = $output->getData();
$this->assertEquals('success', $data['status']);
$this->assertEquals("https://9gag.com", $data['item']['url']);
$this->assertEquals("https://9gag.com/", $data['item']['url']);
}
public function testPublicReadSuccess() {
@ -90,7 +90,7 @@ class Test_BookmarkController extends TestCase {
$output = $this->publicController->getSingleBookmark($this->testSubjectPublicBmId, $this->userid);
$data = $output->getData();
$this->assertEquals('success', $data['status']);
$this->assertEquals("https://9gag.com", $data['item']['url']);
$this->assertEquals("https://9gag.com/", $data['item']['url']);
}
public function testPublicReadFailure() {
@ -172,7 +172,7 @@ class Test_BookmarkController extends TestCase {
$this->controller->editBookmark($id, 'https://www.heise.de', null, '', true, $id, '');
$bookmark = $this->libBookmarks->findUniqueBookmark($id, $this->userid);
$this->assertEquals("https://www.heise.de", $bookmark['url']); // normalized URL
$this->assertEquals("https://www.heise.de/", $bookmark['url']); // normalized URL
}
public function testPrivateDeleteBookmark() {

View File

@ -114,13 +114,13 @@ class Test_LibBookmarks_Bookmarks extends TestCase {
$this->assertFalse(isset($resultOne['lastmodified']));
$this->assertCount(0, $resultOne['tags']);
$this->assertEquals('Golem', $resultOne['title']);
$this->assertEquals('http://www.golem.de', $resultOne['url']);
$this->assertEquals('http://www.golem.de/', $resultOne['url']);
$resultTwo = $resultSet[1];
$this->assertFalse(isset($resultTwo['lastmodified']));
$this->assertCount(0, $resultTwo['tags']);
$this->assertEquals('Google', $resultTwo['title']);
$this->assertEquals('http://www.google.de', $resultTwo['url']);
$this->assertEquals('http://www.google.de/', $resultTwo['url']);
}
public function testFindTags() {
@ -244,7 +244,7 @@ class Test_LibBookmarks_Bookmarks extends TestCase {
$this->libBookmarks->editBookmark($this->userid, $id, "https://www.google.de", "NewTitle", ["three", "four"]);
$bookmark = $this->libBookmarks->findUniqueBookmark($id, $this->userid);
$this->assertEquals("NewTitle", $bookmark['title']);
$this->assertEquals("https://www.google.de", $bookmark['url']);
$this->assertEquals("https://www.google.de/", $bookmark['url']);
$this->assertCount(2, $bookmark['tags']);
$this->assertTrue(in_array('four', $bookmark['tags']));
$this->assertTrue(in_array('three', $bookmark['tags']));
@ -252,7 +252,7 @@ class Test_LibBookmarks_Bookmarks extends TestCase {
// Make sure nothing else changed
$control_bookmark = $this->libBookmarks->findUniqueBookmark($control_bm_id, $this->userid);
$this->assertEquals("Golem", $control_bookmark['title']);
$this->assertEquals("https://www.golem.de", $control_bookmark['url']);
$this->assertEquals("https://www.golem.de/", $control_bookmark['url']);
$this->assertEquals($control_bookmark['tags'], ['four']);
}

View File

@ -1,77 +0,0 @@
<?php
namespace OCA\Bookmarks\Tests;
use OCA\Bookmarks\UrlNormalizer;
/**
 * Checks UrlNormalizer::normalize() against a table of input/expected pairs
 * and verifies that normalizing an already normalized URL changes nothing.
 */
class Test_UrlNormalizer extends TestCase {
    /** @var UrlNormalizer Normalizer under test, recreated for every test */
    private $url;

    protected function setUp() {
        parent::setUp();
        $this->url = new UrlNormalizer();
    }

    /**
     * Runs every [input, expected] pair through normalize() once
     * (correctness) and twice (idempotence).
     */
    public function testGetSorting() {
        $data = [
            ['sindresorhus.com', 'sindresorhus.com'],
            ['sindresorhus.com ', 'sindresorhus.com'],
            ['sindresorhus.com.', 'sindresorhus.com'],
            ['HTTP://sindresorhus.com', 'http://sindresorhus.com'],
            ['//sindresorhus.com', '//sindresorhus.com'],
            ['http://sindresorhus.com', 'http://sindresorhus.com'],
            ['http://sindresorhus.com:80', 'http://sindresorhus.com'],
            ['https://sindresorhus.com:443', 'https://sindresorhus.com'],
            ['ftp://sindresorhus.com:21', 'ftp://sindresorhus.com'],
            ['http://www.sindresorhus.com', 'http://www.sindresorhus.com'],
            ['www.com', 'www.com'],
            ['http://www.www.sindresorhus.com', 'http://www.www.sindresorhus.com'],
            ['www.sindresorhus.com', 'www.sindresorhus.com'],
            ['http://sindresorhus.com/foo/', 'http://sindresorhus.com/foo/'],
            ['sindresorhus.com/?foo=bar baz', 'sindresorhus.com/?foo=bar%20baz'],
            ['https://foo.com/?foo=http://bar.com', 'https://foo.com/?foo=http%3A%2F%2Fbar.com'],
            ['http://sindresorhus.com/%7Efoo/', 'http://sindresorhus.com/~foo/'],
            ['http://sindresorhus.com/foo/######/blablabla', 'http://sindresorhus.com/foo/######/blablabla'],
            ['https://mylink.com/#/#/#/#/#/', 'https://mylink.com/#/#/#/#/#/'],
            ['http://google.com####/foobar', 'http://google.com/####/foobar'],
            ['http://sindresorhus.com/?', 'http://sindresorhus.com'],
            ['http://êxample.com', 'http://xn--xample-hva.com'],
            ['http://sindresorhus.com/?b=bar&a=foo', 'http://sindresorhus.com/?a=foo&b=bar'],
            ['http://sindresorhus.com/?foo=bar*|<>:"', 'http://sindresorhus.com/?foo=bar*%7C%3C%3E%3A%22'],
            ['http://sindresorhus.com:5000', 'http://sindresorhus.com:5000'],
            ['//sindresorhus.com:80/', '//sindresorhus.com:80'],
            ['http://sindresorhus.com/foo#bar', 'http://sindresorhus.com/foo#bar'],
            ['http://sindresorhus.com/foo/bar/../baz', 'http://sindresorhus.com/foo/baz'],
            ['http://sindresorhus.com/foo/bar/./baz', 'http://sindresorhus.com/foo/bar/baz'],
            ['https://i.vimeocdn.com/filter/overlay?src0=https://i.vimeocdn.com/video/598160082_1280x720.jpg&src1=https://f.vimeocdn.com/images_v6/share/play_icon_overlay.png', 'https://i.vimeocdn.com/filter/overlay?src0=https%3A%2F%2Fi.vimeocdn.com%2Fvideo%2F598160082_1280x720.jpg&src1=https%3A%2F%2Ff.vimeocdn.com%2Fimages_v6%2Fshare%2Fplay_icon_overlay.png'],
            // authorization
            ['https://user:password@www.sindresorhus.com', 'https://user:password@www.sindresorhus.com'],
            ['https://user:password@www.sindresorhus.com/@user', 'https://user:password@www.sindresorhus.com/%40user'],
            ['http://user:password@www.êxample.com', 'http://user:password@www.xn--xample-hva.com'],
            // query params
            ['http://sindresorhus.com/?a=Z&b=Y&c=X&d=W', 'http://sindresorhus.com/?a=Z&b=Y&c=X&d=W'],
            ['http://sindresorhus.com/?b=Y&c=X&a=Z&d=W', 'http://sindresorhus.com/?a=Z&b=Y&c=X&d=W'],
            ['http://sindresorhus.com/?a=Z&d=W&b=Y&c=X', 'http://sindresorhus.com/?a=Z&b=Y&c=X&d=W'],
            //['https://www.tivocommunity.com/community/index.php?threads/av-jack-wiring.502081/', 'https://www.tivocommunity.com/community/index.php?threads/av-jack-wiring.502081/'],
            // encoding
            ['http://sindresorhus.com/foo%0cbar/?a=Z&d=W&b=Y&c=X%0c', 'http://sindresorhus.com/foo%0cbar/?a=Z&b=Y&c=X%0c&d=W'],
            ['http://sindresorhus.com////foo/bar', 'http://sindresorhus.com/foo/bar'],
            ['http://sindresorhus.com////foo////bar', 'http://sindresorhus.com/foo/bar'],
            ['//sindresorhus.com//foo', '//sindresorhus.com//foo'], // cannot normalize path if we don't know the protocol
            ['http://sindresorhus.com:5000///foo', 'http://sindresorhus.com:5000/foo'],
            ['http://sindresorhus.com///foo', 'http://sindresorhus.com/foo'],
            ['http://sindresorhus.com:5000//foo', 'http://sindresorhus.com:5000/foo'],
            ['http://sindresorhus.com//foo', 'http://sindresorhus.com/foo']
        ];
        foreach ($data as $item) {
            // The input is carried in the failure message instead of being
            // echoed, so passing runs stay silent and failures name their case.
            $once = $this->url->normalize($item[0]);
            // correctness
            $this->assertEquals($item[1], $once, 'normalize() of "' . $item[0] . '"');
            // idempotence
            $this->assertEquals($item[1], $this->url->normalize($once), 'second normalize() of "' . $item[0] . '"');
        }
    }
}