# -*- coding: utf-8 -*-
from __future__ import annotations

import abc
import logging

from pyquery import PyQuery
import requests

from hotpocket_backend._meta import version as backend_version
from hotpocket_backend.apps.bot.conf import bot_settings
from hotpocket_backend.apps.bot.dto.strategy import FetchResult
from hotpocket_common.url import URL
from hotpocket_soa.dto import BotResultOut

LOGGER = logging.getLogger(__name__)


class Strategy(abc.ABC):
    """Fetch a URL and extract a title and a description from its HTML.

    `run()` is the entry point: it checks the URL's hostname against
    `bot_settings.BANNED_HOSTNAMES`, fetches the page with `fetch()` and
    feeds the decoded body to `extract_title_and_description_from_html()`.
    """

    class StrategyError(Exception):
        """Base class of the errors raised by a strategy."""

    class FetchError(StrategyError):
        """Raised by `fetch()` when the request or its validation fails."""

    class RuntimeError(StrategyError):
        """Raised by `run()` when a fetched result cannot be processed.

        Shadows the builtin of the same name inside the class namespace
        only; callers refer to it as `Strategy.RuntimeError`.
        """

    USER_AGENT = (
        'Mozilla/5.0 '
        '('
        'compatible; '
        f'BTHLabsHotPocketBot/{backend_version}; '
        '+https://hotpocket.app/bot.txt'
        ')'
    )

    # `(connect, read)` timeouts in seconds passed to `requests`. Without an
    # explicit timeout `requests` waits forever on an unresponsive host.
    # Subclasses may override this.
    FETCH_TIMEOUT = (10.0, 30.0)

    # Selectors are tried in order; the first one yielding a non-empty value
    # wins.
    TITLE_TAG_SELECTORS = [
        'head > meta[name=title]',
        'head > meta[property="og:title"]',
        'head > title',
    ]

    DESCRIPTION_TAG_SELECTORS = [
        'head > meta[property="og:description"]',
        'head > meta[name=description]',
    ]

    def __init__(self, url: str):
        """Store `url`, parse it and set up the per-class logger."""
        super().__init__()

        self.url = url
        self.parsed_url = URL(self.url)

        self.logger = self.get_logger()

    def get_logger(self) -> logging.Logger:
        """Return a child of the module logger named after the class."""
        return LOGGER.getChild(self.__class__.__name__)

    def is_netloc_banned(self) -> bool:
        """Tell whether the URL's hostname ends with a banned hostname.

        Returns `False` when the parsed URL has no hostname.
        """
        hostname = self.parsed_url.hostname
        if hostname is None:
            return False

        # NOTE(review): this is a plain suffix match, so `notexample.com`
        # matches a banned `example.com` unless the configured entries carry
        # a leading dot -- confirm this is the intended semantics.
        return any(
            hostname.endswith(banned_netloc)
            for banned_netloc in bot_settings.BANNED_HOSTNAMES
        )

    def fetch(self, url: str) -> FetchResult:
        """GET `url` and wrap the response in a `FetchResult`.

        Raises:
            Strategy.FetchError: on any failure -- connection problems,
                timeouts, an error HTTP status (`raise_for_status()`) or a
                `FetchResult` validation error. The original exception is
                chained as the cause.
        """
        try:
            response = requests.request(
                'GET',
                url,
                headers={
                    'User-Agent': self.USER_AGENT,
                },
                timeout=self.FETCH_TIMEOUT,
            )
            response.raise_for_status()

            return FetchResult.model_validate(dict(
                status_code=response.status_code,
                content=response.content,
                content_type=response.headers.get('Content-Type', None),
                encoding=response.encoding or response.apparent_encoding,
            ))
        except Exception as exception:
            self.logger.error(
                'Fetch error: %s', exception, exc_info=True,
            )
            raise self.FetchError() from exception

    @staticmethod
    def _clean_text(value: str | None) -> str | None:
        """Strip `value`; return `None` when it is missing or blank."""
        if value is None:
            return None

        return value.strip() or None

    def extract_title_and_description_from_html(
        self,
        content: str,
    ) -> tuple[str | None, str | None]:
        """Extract the page title and description from HTML `content`.

        The title comes from the first of `TITLE_TAG_SELECTORS` that yields
        a non-empty value. The description comes from the first of
        `DESCRIPTION_TAG_SELECTORS` that does and otherwise falls back to
        the text of the first `<p>` element.

        Returns:
            A `(title, description)` tuple of stripped strings; either item
            is `None` when nothing usable was found.
        """
        dom = PyQuery(content)

        title: str | None = None
        for selector in self.TITLE_TAG_SELECTORS:
            title_tags = dom.find(selector)
            if len(title_tags) == 0:
                continue

            title_tag = PyQuery(title_tags[0])
            if title_tag.is_('meta'):
                title = self._clean_text(title_tag.attr('content'))
            else:
                title = self._clean_text(title_tag.text())

            # Keep looking when the tag exists but carries no usable value,
            # so that an empty `<meta>` does not hide a populated `<title>`.
            if title is not None:
                break

        description: str | None = None
        for selector in self.DESCRIPTION_TAG_SELECTORS:
            description_tags = dom.find(selector)
            if len(description_tags) == 0:
                continue

            description = self._clean_text(
                PyQuery(description_tags[0]).attr('content'),
            )
            if description is not None:
                break

        if description is None:
            paragraphs = dom.find('p')
            if len(paragraphs) > 0:
                description = self._clean_text(PyQuery(paragraphs[0]).text())

        return title, description

    @staticmethod
    def _find_fetch_result_problem(fetch_result: FetchResult) -> str | None:
        """Return why `fetch_result` cannot be processed, or `None` if it can."""
        if fetch_result.content is None:
            return 'Received empty content'

        if fetch_result.content_type is None:
            return 'Unable to determine the content type'

        if not fetch_result.content_type.startswith('text/html'):
            return f'Unsupported content type: `{fetch_result.content_type}`'

        return None

    def run(self) -> BotResultOut:
        """Run the strategy for `self.url`.

        Returns:
            A `BotResultOut`. For a banned hostname only `is_netloc_banned`
            is set and nothing is fetched; otherwise `title` and
            `description` hold the extracted values.

        Raises:
            Strategy.FetchError: when fetching the URL fails.
            Strategy.RuntimeError: when the fetched result has no content,
                has no or a non-`text/html` content type, or when decoding
                or parsing it fails.
        """
        result = BotResultOut.model_validate(dict(
            title=None,
            description=None,
            is_netloc_banned=False,
        ))

        result.is_netloc_banned = self.is_netloc_banned()
        if result.is_netloc_banned is not False:
            self.logger.debug('Skipping banned netloc: url=`%s`', self.url)
            return result

        fetch_result = self.fetch(self.url)

        # Explicit checks rather than `assert`: assertions are removed when
        # Python runs with `-O`, which would let unsupported content through.
        problem = self._find_fetch_result_problem(fetch_result)
        if problem is not None:
            self.logger.error('Unprocessable fetch result: %s', problem)
            raise self.RuntimeError(problem)

        try:
            decoded_content = fetch_result.content.decode(fetch_result.encoding)

            title, description = self.extract_title_and_description_from_html(
                decoded_content,
            )

            result.title = title
            result.description = description
        except Exception as exception:
            self.logger.error(
                'Processing error: %s', exception, exc_info=exception,
            )
            raise self.RuntimeError() from exception

        return result