Release v1.0.0
Some checks failed
CI / Checks (push) Failing after 13m2s

This commit is contained in:
2025-08-20 21:00:50 +02:00
commit b4338e2769
401 changed files with 23576 additions and 0 deletions

View File

@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
from django.apps import AppConfig
from django.core.exceptions import ImproperlyConfigured
from django.utils.translation import gettext_lazy as _
class BotConfig(AppConfig):
    """Django application configuration for the bot app."""

    name = 'hotpocket_backend.apps.bot'
    label = 'bot'
    verbose_name = _('Bot')
    default_auto_field = 'django.db.models.BigAutoField'

    def ready(self):
        """Build the bot settings and publish them on the ``conf`` module.

        The result of ``conf.from_django_settings()`` is stored as
        ``conf.bot_settings``.

        Raises:
            ImproperlyConfigured: If importing ``conf`` or building the
                settings fails for any reason. The original error is kept
                as the cause.
        """
        super().ready()

        try:
            # Function-scope import: `conf` is only pulled in when the app
            # is being readied, not when this module is imported.
            from hotpocket_backend.apps.bot import conf

            conf.bot_settings = conf.from_django_settings()
        except Exception as error:
            raise ImproperlyConfigured('Invalid bot settings') from error

View File

@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
import dataclasses
from .types import PStrategy
# Import path (``module:attribute``) of the strategy used when the Django
# settings do not name one.
DEFAULT_STRATEGY = 'hotpocket_backend.apps.bot.strategy.basic:BasicStrategy'

# Hostnames the bot does not process unless the Django settings override
# the list.
DEFAULT_BANNED_HOSTNAMES = [
    # YouTube: fetching a page yields dummy data, so no useful metadata can
    # be extracted from it. Doing this properly would require Google's APIs.
    'youtube.com',
    'youtu.be',

    # Reddit: the response is too generic to pull any useful info from it,
    # and using their API is deliberately not considered (a stance taken
    # after third-party clients such as Apollo were forced to shut down).
    'reddit.com',

    # Twitter / X, including its link shortener.
    'twitter.com',
    't.co',
    'x.com',
]
@dataclasses.dataclass(kw_only=True)
class Settings:
STRATEGY: str
BANNED_HOSTNAMES: list[str]
def get_strategy(self, *, url: str) -> PStrategy:
from hotpocket_common.loader import load_module_attribute
strategy = load_module_attribute(self.STRATEGY)
return strategy(url)
def from_django_settings() -> Settings:
    """Build the bot `Settings` from the Django settings.

    Reads `HOTPOCKET_BOT_STRATEGY` and `HOTPOCKET_BOT_BANNED_HOSTNAMES`,
    falling back to `DEFAULT_STRATEGY` and `DEFAULT_BANNED_HOSTNAMES` when a
    setting is not defined.

    Returns:
        A new `Settings` instance. Its `BANNED_HOSTNAMES` is always a fresh
        list, so mutating it never alters the module-level default or the
        object stored in the Django settings.

    Raises:
        TypeError: If the configured banned hostnames value is not iterable.
    """
    from django.conf import settings

    strategy = getattr(settings, 'HOTPOCKET_BOT_STRATEGY', DEFAULT_STRATEGY)
    banned_hostnames = getattr(
        settings,
        'HOTPOCKET_BOT_BANNED_HOSTNAMES',
        DEFAULT_BANNED_HOSTNAMES,
    )

    return Settings(
        STRATEGY=strategy,
        # Defensive copy: avoids sharing the mutable default list between
        # `Settings` instances, and normalizes tuples to the declared
        # `list[str]` type.
        BANNED_HOSTNAMES=list(banned_hostnames),
    )
# Module-level holder for the active bot settings. It is `None` at import
# time (hence the type-ignore) and is replaced with a real `Settings`
# instance when the result of `from_django_settings()` is assigned to it.
bot_settings: Settings = None # type: ignore[assignment]

View File

@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
import pydantic
class FetchResult(pydantic.BaseModel):
    """Data captured from an HTTP response after fetching a URL."""

    # HTTP status code of the response.
    status_code: int
    # Raw (undecoded) response body.
    content: bytes
    # Value of the `Content-Type` header; `None` is accepted, but the field
    # itself has no default and must always be supplied.
    content_type: str | None
    # Name of the text encoding to decode `content` with.
    encoding: str

View File

@@ -0,0 +1 @@
from .bot import BotService # noqa: F401

View File

@@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
from hotpocket_backend.apps.bot.conf import bot_settings
from hotpocket_backend.apps.bot.types import PStrategy
from hotpocket_soa.dto import BotResultOut
class BotService:
    """Thin service facade over the configured bot strategy.

    Every call builds a fresh strategy for the given URL through
    `bot_settings.get_strategy()`.

    NOTE(review): `bot_settings` is bound by a `from ... import` when this
    module is imported. If that import can happen before the settings
    object is assigned in the `conf` module, the name here would still be
    `None` -- confirm the import order.
    """

    def is_netloc_banned(self, *, url: str) -> bool:
        """Return whether the strategy reports the URL's host as banned.

        Args:
            url: The URL to check.
        """
        return bot_settings.get_strategy(url=url).is_netloc_banned()

    def handle(self, *, url: str) -> BotResultOut:
        """Run the strategy for `url` and return its result.

        Args:
            url: The URL to process.
        """
        return bot_settings.get_strategy(url=url).run()

View File

@@ -0,0 +1 @@
from .basic import BasicStrategy # noqa: F401

View File

@@ -0,0 +1,174 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
import abc
import logging
from pyquery import PyQuery
import requests
from hotpocket_backend._meta import version as backend_version
from hotpocket_backend.apps.bot.conf import bot_settings
from hotpocket_backend.apps.bot.dto.strategy import FetchResult
from hotpocket_common.url import URL
from hotpocket_soa.dto import BotResultOut
# Module-level logger; `Strategy.get_logger()` derives a per-class child
# logger from it.
LOGGER = logging.getLogger(__name__)
class Strategy(abc.ABC):
    """Base strategy: fetch a URL and extract a title and a description.

    A strategy is bound to a single URL. `run()` is the entry point: it
    checks the ban list, fetches the page, validates that the response is
    HTML and extracts the metadata. Subclasses can customise the behaviour
    by overriding the class-level constants or individual methods.
    """

    class StrategyError(Exception):
        """Root of all errors raised by a strategy."""

    class FetchError(StrategyError):
        """Raised by `fetch()` when the URL cannot be retrieved."""

    class RuntimeError(StrategyError):
        """Raised by `run()` when a fetched response cannot be processed.

        The name shadows the builtin inside the class namespace only; it is
        kept because callers reference it as `Strategy.RuntimeError`.
        """

    USER_AGENT = (
        'Mozilla/5.0 '
        '('
        'compatible; '
        f'BTHLabsHotPocketBot/{backend_version}; '
        '+https://hotpocket.app/bot.txt'
        ')'
    )

    # Passed as `timeout=` to `requests`: (connect, read) in seconds. Without
    # a timeout a stalled server would block the caller indefinitely.
    FETCH_TIMEOUT: float | tuple[float, float] = (10.0, 30.0)

    # Used when neither the response nor the detector yields an encoding.
    FALLBACK_ENCODING = 'utf-8'

    # Selectors are tried in order; the first one yielding a non-blank value
    # wins.
    TITLE_TAG_SELECTORS = [
        'head > meta[name=title]',
        'head > meta[property="og:title"]',
        'head > title',
    ]

    DESCRIPTION_TAG_SELECTORS = [
        'head > meta[property="og:description"]',
        'head > meta[name=description]',
    ]

    def __init__(self, url: str):
        """Bind the strategy to `url`.

        Args:
            url: The URL to process. It is also parsed into `parsed_url`.
        """
        super().__init__()

        self.url = url
        self.parsed_url = URL(self.url)
        self.logger = self.get_logger()

    def get_logger(self) -> logging.Logger:
        """Return a child of the module logger named after the class."""
        return LOGGER.getChild(self.__class__.__name__)

    @staticmethod
    def _normalize_hostname(hostname: str) -> str:
        """Lower-case `hostname` and drop surrounding whitespace and dots."""
        return hostname.strip().lower().strip('.')

    def is_netloc_banned(self) -> bool:
        """Tell whether the URL's hostname is on the ban list.

        A hostname is banned when it equals a banned entry or is a subdomain
        of one. Matching is done on whole DNS labels, so banning `x.com`
        covers `www.x.com` but not `dropbox.com`. The comparison ignores
        case and a trailing dot on the hostname, and a leading dot on a
        banned entry.

        Returns:
            `True` if the hostname is banned, `False` otherwise (including
            when the URL has no hostname).
        """
        raw_hostname = self.parsed_url.hostname
        if raw_hostname is None:
            return False

        hostname = self._normalize_hostname(raw_hostname)

        for entry in bot_settings.BANNED_HOSTNAMES:
            banned = self._normalize_hostname(entry)
            if not banned:
                # A blank entry must not ban every hostname.
                continue

            if hostname == banned or hostname.endswith('.' + banned):
                return True

        return False

    def fetch(self, url: str) -> FetchResult:
        """Fetch `url` with a GET request.

        Args:
            url: The URL to fetch.

        Returns:
            The status code, raw body, content type and encoding of the
            response.

        Raises:
            FetchError: On any failure: connection problems, timeouts, an
                HTTP error status or an invalid result. The original error
                is logged and kept as the cause.
        """
        # NOTE(review): `url` is requested as-is. If it can come from
        # untrusted users, consider rejecting private/internal addresses.
        try:
            response = requests.request(
                'GET',
                url,
                headers={
                    'User-Agent': self.USER_AGENT,
                },
                timeout=self.FETCH_TIMEOUT,
            )
            response.raise_for_status()

            # The final fallback guards against the detector yielding no
            # encoding, which would otherwise fail the model validation.
            encoding = (
                response.encoding
                or response.apparent_encoding
                or self.FALLBACK_ENCODING
            )

            return FetchResult.model_validate(dict(
                status_code=response.status_code,
                content=response.content,
                content_type=response.headers.get('Content-Type', None),
                encoding=encoding,
            ))
        except Exception as exception:
            self.logger.error(
                'Fetch error: %s', exception, exc_info=True,
            )
            raise self.FetchError() from exception

    @staticmethod
    def _clean_text(value: object) -> str | None:
        """Return `value` stripped, or `None` if it is not a non-blank string."""
        if not isinstance(value, str):
            return None

        return value.strip() or None

    def _first_tag_value(self, dom: PyQuery, selectors: list[str]) -> str | None:
        """Return the first non-blank value found by trying `selectors` in order.

        For each selector only the first matching tag is inspected: the
        `content` attribute of a `meta` tag, or the text of any other tag.
        """
        for selector in selectors:
            matches = dom.find(selector)
            if len(matches) == 0:
                continue

            tag = PyQuery(matches[0])
            raw_value = tag.attr('content') if tag.is_('meta') else tag.text()

            value = self._clean_text(raw_value)
            if value is not None:
                return value

        return None

    def extract_title_and_description_from_html(self, content: str) -> tuple[str | None, str | None]:
        """Extract the title and description from an HTML document.

        The title comes from `TITLE_TAG_SELECTORS` and the description from
        `DESCRIPTION_TAG_SELECTORS`. A tag whose value is missing or blank
        is skipped, so the next selector still gets a chance. When no
        description tag yields a value, the text of the first `<p>` element
        is used instead.

        Args:
            content: The decoded HTML document.

        Returns:
            A `(title, description)` tuple. Each item is a stripped,
            non-empty string or `None`.
        """
        dom = PyQuery(content)

        title = self._first_tag_value(dom, self.TITLE_TAG_SELECTORS)
        description = self._first_tag_value(dom, self.DESCRIPTION_TAG_SELECTORS)

        if description is None:
            paragraphs = dom.find('p')
            if len(paragraphs) > 0:
                description = self._clean_text(PyQuery(paragraphs[0]).text())

        return title, description

    @staticmethod
    def _find_fetch_result_problem(fetch_result: FetchResult) -> str | None:
        """Return why `fetch_result` cannot be processed, or `None` if it can."""
        if not fetch_result.content:
            return 'Received empty content'

        if fetch_result.content_type is None:
            return 'Unable to determine the content type'

        # Media types are case-insensitive.
        if not fetch_result.content_type.strip().lower().startswith('text/html'):
            return f'Unsupported content type: `{fetch_result.content_type}`'

        return None

    def run(self) -> BotResultOut:
        """Process the URL and return the extracted metadata.

        For a banned hostname nothing is fetched and the result only has
        `is_netloc_banned` set.

        Returns:
            The result with `title`, `description` and `is_netloc_banned`.

        Raises:
            FetchError: If the URL cannot be fetched.
            RuntimeError: (`Strategy.RuntimeError`) If the response is
                empty, is not HTML, or cannot be decoded or parsed.
        """
        result = BotResultOut.model_validate(dict(
            title=None,
            description=None,
            is_netloc_banned=False,
        ))

        result.is_netloc_banned = self.is_netloc_banned()
        if result.is_netloc_banned:
            self.logger.debug('Skipping banned netloc: url=`%s`', self.url)
            return result

        fetch_result = self.fetch(self.url)

        # Explicit checks instead of `assert`, which is stripped when Python
        # runs with `-O` and would silently disable this validation.
        problem = self._find_fetch_result_problem(fetch_result)
        if problem is not None:
            self.logger.error('Unprocessable fetch result: %s', problem)
            raise self.RuntimeError(problem)

        try:
            # `errors='replace'` keeps a page usable when only a few bytes
            # do not match the reported encoding.
            decoded_content = fetch_result.content.decode(
                fetch_result.encoding, errors='replace',
            )

            title, description = self.extract_title_and_description_from_html(
                decoded_content,
            )

            result.title = title
            result.description = description
        except Exception as exception:
            self.logger.error(
                'Processing error: %s', exception, exc_info=exception,
            )
            raise self.RuntimeError() from exception

        return result

View File

@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
from .base import Strategy
class BasicStrategy(Strategy):
    """Strategy that uses the base `Strategy` behaviour without changes."""

View File

@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
import typing
from hotpocket_soa.dto import BotResultOut
class PStrategy(typing.Protocol):
    """Structural interface every bot strategy has to provide."""

    def __init__(self, url: str):
        """Create the strategy for `url`."""

    def is_netloc_banned(self) -> bool:
        """Tell whether the strategy's URL points at a banned host."""

    def run(self) -> BotResultOut:
        """Process the strategy's URL and return the outcome."""