You've already forked hotpocket
This commit is contained in:
@@ -0,0 +1 @@
|
||||
from .basic import BasicStrategy # noqa: F401
|
||||
174
services/backend/hotpocket_backend/apps/bot/strategy/base.py
Normal file
174
services/backend/hotpocket_backend/apps/bot/strategy/base.py
Normal file
@@ -0,0 +1,174 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import annotations
|
||||
|
||||
import abc
|
||||
import logging
|
||||
|
||||
from pyquery import PyQuery
|
||||
import requests
|
||||
|
||||
from hotpocket_backend._meta import version as backend_version
|
||||
from hotpocket_backend.apps.bot.conf import bot_settings
|
||||
from hotpocket_backend.apps.bot.dto.strategy import FetchResult
|
||||
from hotpocket_common.url import URL
|
||||
from hotpocket_soa.dto import BotResultOut
|
||||
|
||||
# Module-level logger; `Strategy.get_logger()` derives per-class child loggers from it.
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Strategy(abc.ABC):
    """Base class for bot strategies that fetch a URL and extract metadata.

    A strategy is constructed for a single URL. `run()` checks the URL's
    hostname against `bot_settings.BANNED_HOSTNAMES`, fetches the page when
    the hostname is not banned, and extracts a title and a description from
    the returned HTML.
    """

    class StrategyError(Exception):
        """Root of the exceptions raised by strategies."""

    class FetchError(StrategyError):
        """Raised by `fetch()` when the HTTP request (or its validation) fails."""

    # NOTE: Intentionally named like the builtin. Inside methods the bare name
    # `RuntimeError` still resolves to the builtin; this one is only reachable
    # as `self.RuntimeError` / `Strategy.RuntimeError`.
    class RuntimeError(StrategyError):
        """Raised by `run()` when a fetched result cannot be processed."""

    USER_AGENT = (
        'Mozilla/5.0 '
        '('
        'compatible; '
        f'BTHLabsHotPocketBot/{backend_version}; '
        '+https://hotpocket.app/bot.txt'
        ')'
    )

    # (connect, read) timeouts in seconds passed to `requests`. Without a
    # timeout an unresponsive host would block the caller indefinitely.
    # Subclasses may override.
    FETCH_TIMEOUT: tuple[float, float] = (10.0, 30.0)

    # Selectors are tried in order; the first one yielding a non-empty value
    # wins.
    TITLE_TAG_SELECTORS = [
        'head > meta[name=title]',
        'head > meta[property="og:title"]',
        'head > title',
    ]

    DESCRIPTION_TAG_SELECTORS = [
        'head > meta[property="og:description"]',
        'head > meta[name=description]',
    ]

    def __init__(self, url: str):
        """Store `url`, parse it and set up the per-class logger."""
        super().__init__()

        self.url = url
        self.parsed_url = URL(self.url)

        self.logger = self.get_logger()

    def get_logger(self) -> logging.Logger:
        """Return a child of the module logger named after the concrete class."""
        return LOGGER.getChild(self.__class__.__name__)

    def is_netloc_banned(self) -> bool:
        """Return `True` if the URL's hostname ends with a banned hostname.

        A URL without a hostname is never considered banned.
        """
        hostname = self.parsed_url.hostname
        if hostname is None:
            return False

        # NOTE(review): This is a plain suffix match, so a banned `example.com`
        # also matches `notexample.com`. Confirm whether entries in
        # `BANNED_HOSTNAMES` are expected to account for that (e.g. with a
        # leading dot) before tightening the comparison.
        return any(
            hostname.endswith(banned_netloc)
            for banned_netloc in bot_settings.BANNED_HOSTNAMES
        )

    def fetch(self, url: str) -> FetchResult:
        """GET `url` and wrap the response in a `FetchResult`.

        Raises:
            Strategy.FetchError: On any failure, including a non-2xx status,
                a timeout or a result that fails validation. The original
                exception is logged and chained as the cause.
        """
        try:
            response = requests.request(
                'GET',
                url,
                headers={
                    'User-Agent': self.USER_AGENT,
                },
                timeout=self.FETCH_TIMEOUT,
            )
            response.raise_for_status()

            return FetchResult.model_validate(dict(
                status_code=response.status_code,
                content=response.content,
                content_type=response.headers.get('Content-Type', None),
                encoding=response.encoding or response.apparent_encoding,
            ))
        except Exception as exception:
            self.logger.error(
                'Fetch error: %s', exception, exc_info=True,
            )
            raise self.FetchError() from exception

    @staticmethod
    def _clean_text(value: object) -> str | None:
        """Return `value` stripped, or `None` if it is not a non-blank string."""
        if not isinstance(value, str):
            return None

        return value.strip() or None

    def _extract_first(self, dom: PyQuery, selectors: list[str]) -> str | None:
        """Return the first non-empty value produced by `selectors`, in order.

        `<meta>` tags contribute their `content` attribute, other tags their
        text. A selector whose first match yields nothing usable is skipped so
        that later selectors still get a chance.
        """
        for selector in selectors:
            tags = dom.find(selector)
            if len(tags) == 0:
                continue

            tag = PyQuery(tags[0])
            raw_value = tag.attr('content') if tag.is_('meta') else tag.text()

            value = self._clean_text(raw_value)
            if value is not None:
                return value

        return None

    def extract_title_and_description_from_html(self, content: str) -> tuple[str | None, str | None]:
        """Extract `(title, description)` from an HTML document.

        The title comes from `TITLE_TAG_SELECTORS` and the description from
        `DESCRIPTION_TAG_SELECTORS`. When no selector yields a description,
        the text of the first `<p>` element is used instead. Values are
        stripped; blank values are returned as `None`.
        """
        dom = PyQuery(content)

        title = self._extract_first(dom, self.TITLE_TAG_SELECTORS)
        description = self._extract_first(dom, self.DESCRIPTION_TAG_SELECTORS)

        if description is None:
            paragraphs = dom.find('p')
            if len(paragraphs) > 0:
                description = self._clean_text(PyQuery(paragraphs[0]).text())

        return (title, description)

    def _require_html_content(self, fetch_result: FetchResult) -> bytes:
        """Return the fetched body if it is present and declared as HTML.

        Uses explicit checks rather than `assert`, which is removed when
        Python runs with `-O`.

        Raises:
            Strategy.RuntimeError: If the content or content type is missing,
                or the content type is not `text/html`.
        """
        content = fetch_result.content
        content_type = fetch_result.content_type

        problem: str | None = None
        if content is None:
            problem = 'Received empty content'
        elif content_type is None:
            problem = 'Unable to determine the content type'
        # Media types are case-insensitive, hence the normalisation.
        elif not content_type.strip().lower().startswith('text/html'):
            problem = f'Unsupported content type: `{content_type}`'

        if problem is not None:
            self.logger.error('Unprocessable fetch result: %s', problem)
            raise self.RuntimeError(problem)

        return content

    def run(self) -> BotResultOut:
        """Execute the strategy for `self.url`.

        Returns:
            A `BotResultOut` with `is_netloc_banned` set and, for hosts that
            are not banned, the extracted `title` and `description`.

        Raises:
            Strategy.FetchError: If the page could not be fetched.
            Strategy.RuntimeError: If the fetched result is not usable HTML or
                processing it failed.
        """
        result = BotResultOut.model_validate(dict(
            title=None,
            description=None,
            is_netloc_banned=False,
        ))

        result.is_netloc_banned = self.is_netloc_banned()

        if result.is_netloc_banned:
            self.logger.debug('Skipping banned netloc: url=`%s`', self.url)
            return result

        fetch_result = self.fetch(self.url)
        content = self._require_html_content(fetch_result)

        try:
            decoded_content = content.decode(fetch_result.encoding)

            title, description = self.extract_title_and_description_from_html(
                decoded_content,
            )
            result.title = title
            result.description = description
        except Exception as exception:
            self.logger.error(
                'Processing error: %s', exception, exc_info=exception,
            )
            raise self.RuntimeError() from exception

        return result
|
||||
@@ -0,0 +1,8 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import annotations
|
||||
|
||||
from .base import Strategy
|
||||
|
||||
|
||||
class BasicStrategy(Strategy):
    """Strategy subclass that adds nothing of its own.

    All behaviour is inherited unchanged from `Strategy`.
    """
|
||||
Reference in New Issue
Block a user