bookmarks/lib/Service/CrawlService.php

170 lines
5.4 KiB
PHP

<?php
/*
* Copyright (c) 2020-2024. The Nextcloud Bookmarks contributors.
*
* This file is licensed under the Affero General Public License version 3 or later. See the COPYING file.
*/
namespace OCA\Bookmarks\Service;
use Exception;
use fivefilters\Readability\Configuration;
use fivefilters\Readability\Readability;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Response;
use Mimey\MimeTypes;
use OC\User\NoUserException;
use OCA\Bookmarks\Db\Bookmark;
use OCA\Bookmarks\Db\BookmarkMapper;
use OCA\Bookmarks\Exception\UrlParseError;
use OCP\Files\Folder;
use OCP\Files\GenericFileException;
use OCP\Files\InvalidPathException;
use OCP\Files\IRootFolder;
use OCP\Files\NotFoundException;
use OCP\Files\NotPermittedException;
use OCP\IConfig;
use OCP\IL10N;
use OCP\Lock\LockedException;
use Psr\Log\LoggerInterface;
class CrawlService {
public const MAX_BODY_LENGTH = 92160000; // 90 MB
public const TIMEOUT = 10;
public const CONNECT_TIMEOUT = 10;
public const READ_TIMEOUT = 10;
public const UA_FIREFOX = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0';
private MimeTypes $mimey;
public function __construct(
private BookmarkMapper $bookmarkMapper,
private BookmarkPreviewer $bookmarkPreviewer,
private FaviconPreviewer $faviconPreviewer,
private IConfig $config,
private IRootFolder $rootFolder,
private IL10N $l,
private LoggerInterface $logger,
private UserSettingsService $userSettingsService) {
$this->mimey = new MimeTypes;
}
/**
* @param Bookmark $bookmark
* @throws UrlParseError
*/
public function crawl(Bookmark $bookmark): void {
if (!$bookmark->isWebLink()) {
return;
}
try {
$client = new Client();
/** @var Response $resp */
$resp = $client->get($bookmark->getUrl(), [
'headers' => [
'User-Agent' => self::UA_FIREFOX,
],
'connect_timeout' => self::CONNECT_TIMEOUT,
'timeout' => self::TIMEOUT,
'read_timeout' => self::READ_TIMEOUT,
'http_errors' => false
]);
$available = $resp ? $resp->getStatusCode() !== 404 : false;
} catch (Exception $e) {
$this->logger->warning($e->getMessage());
$available = false;
}
if ($available) {
$this->userSettingsService->setUserId($bookmark->getUserId());
if (((boolean) $this->userSettingsService->get('archive.enabled')) === true) {
$this->archiveFile($bookmark, $resp);
$this->archiveContent($bookmark, $resp);
}
$this->bookmarkPreviewer->getImage($bookmark);
$this->faviconPreviewer->getImage($bookmark);
}
$bookmark->markPreviewCreated();
$bookmark->setAvailable($available);
$this->bookmarkMapper->update($bookmark);
}
private function archiveContent(Bookmark $bookmark, Response $resp) : void {
$header = $resp->getHeader('Content-type');
if(empty($header)) {
return;
}
$contentType = $header[0];
if ((bool)preg_match('#text/html#i', $contentType) === true && ($bookmark->getHtmlContent() === null || $bookmark->getHtmlContent() === '')) {
$config = new Configuration();
$config
->setFixRelativeURLs(true)
->setOriginalURL($bookmark->getUrl())
->setSubstituteEntities(true);
$readability = new Readability($config);
try {
$readability->parse($resp->getBody());
} catch (\Throwable $e) {
$this->logger->debug(get_class($e)." ".$e->getMessage()."\r\n".$e->getTraceAsString());
return;
}
$bookmark->setHtmlContent($readability->getContent());
$bookmark->setTextContent(strip_tags($readability->getContent()));
}
}
private function archiveFile(Bookmark $bookmark, Response $resp) :void {
$header = $resp->getHeader('Content-type');
if(empty($header)) {
return;
}
$contentType = $header[0];
if ((bool)preg_match('#text/html#i', $contentType) === false && $bookmark->getArchivedFile() === null && (int)$resp->getHeader('Content-length')[0] < self::MAX_BODY_LENGTH) {
try {
$userFolder = $this->rootFolder->getUserFolder($bookmark->getUserId());
$folderPath = $this->getArchivePath($bookmark, $userFolder);
$name = $bookmark->slugify('title');
$extension = $this->mimey->getExtension($contentType);
if (!$extension || trim($extension) === '') {
$extension = 'txt';
}
$path = $folderPath . '/' . $name . '.' . $extension;
$i = 0;
while ($userFolder->nodeExists($path)) {
$path = $folderPath . '/' .$name . '_' . $i . '.' . $extension;
$i++;
}
$file = $userFolder->newFile($path);
$file->putContent($resp->getBody());
$bookmark->setArchivedFile($file->getId());
$this->bookmarkMapper->update($bookmark);
} catch (NotPermittedException | NoUserException | GenericFileException | LockedException | UrlParseError | InvalidPathException | NotFoundException $e) {
$this->logger->debug(get_class($e)." ".$e->getMessage()."\r\n".$e->getTraceAsString());
}
}
}
private function getArchivePath(Bookmark $bookmark, Folder $userFolder): string {
$folderPath = $this->config->getUserValue($bookmark->getUserId(), 'bookmarks', 'archive.filePath', $this->l->t('Bookmarks'));
$this->getOrCreateFolder($userFolder, $folderPath);
return $folderPath;
}
public function getOrCreateFolder(Folder $userFolder, string $path) : ?Folder {
if ($path === '/') {
return $userFolder;
}
if ($userFolder->nodeExists($path)) {
$folder = $userFolder->get($path);
} else {
$folder = $userFolder->newFolder($path);
}
if (!($folder instanceof Folder)) {
return null;
}
return $folder;
}
}