Fix #29. Sanitizing HTML to leave only HTML tags allowed by Telegram Bot API.
Some checks failed
continuous-integration/drone/push Build is passing
continuous-integration/drone/pr Build is failing

This commit is contained in:
Alexey Skobkin 2022-07-09 23:13:53 +03:00
parent 7c373c8f78
commit ecc97fa2a0
No known key found for this signature in database
GPG key ID: 5D5CEF6F221278E7
2 changed files with 19 additions and 4 deletions

View file

@ -1,3 +1,4 @@
bleach==5.0.1
certifi==2021.10.8 certifi==2021.10.8
charset-normalizer==2.0.12 charset-normalizer==2.0.12
decorator==5.1.1 decorator==5.1.1
@ -7,5 +8,7 @@ pyTelegramBotAPI==4.5.0
python-dotenv==0.20.0 python-dotenv==0.20.0
requests==2.27.1 requests==2.27.1
sgmllib3k==1.0.0 sgmllib3k==1.0.0
six==1.16.0
urllib3==1.26.9 urllib3==1.26.9
validators==0.19.0 validators==0.19.0
webencodings==0.5.1

View file

@ -1,5 +1,6 @@
import time import time
from bleach.sanitizer import Cleaner
from telebot import TeleBot from telebot import TeleBot
from telebot.handler_backends import BaseMiddleware from telebot.handler_backends import BaseMiddleware
from telebot.types import Message from telebot.types import Message
@ -96,6 +97,12 @@ class Notifier:
def __init__(self, token: str):
    """Create the bot client and the HTML sanitizer applied to outgoing messages"""
    self.bot: TeleBot = TeleBot(token)

    # Whitelist of tags for Telegram's HTML parse mode. 'span' is deliberately
    # absent: Telegram accepts it only as <span class="tg-spoiler">, and the
    # attribute whitelist below does not keep 'class', so whitelisting 'span'
    # would let a bare <span> through and get the message rejected.
    # With strip=True the tag is removed and its inner text is kept.
    self.html_sanitizer: Cleaner = Cleaner(
        tags=['b', 'strong', 'i', 'em', 'u', 'ins', 's', 'strike', 'del', 'tg-spoiler', 'a', 'code', 'pre'],
        attributes={"a": ["href", "title"]},
        protocols=['http', 'https'],
        strip=True,
    )
def send_updates(self, chat_ids: list[int], updates: list[FeedItem], feed_title: str): def send_updates(self, chat_ids: list[int], updates: list[FeedItem], feed_title: str):
"""Send notification about new items to the user""" """Send notification about new items to the user"""
@ -127,14 +134,19 @@ class Notifier:
self.sent_counter = 0 self.sent_counter = 0
self.sent_counter += 1 self.sent_counter += 1
@staticmethod def __format_message(self, item: FeedItem) -> str:
def __format_message(item: FeedItem) -> str:
return ( return (
f"<strong><a href=\"{item.url}\">{item.title}</a></strong>\n\n" f"<strong><a href=\"{item.url}\">{item.title}</a></strong>\n\n"
f"{item.date}\n" # TODO: format properly when FeedItem starts to return proper datetime object
# f"{item.description}" #f"{item.date}\n"
f"{self.__sanitize_html(item.description)}"
) )
def __sanitize_html(self, html: str) -> str:
if not html:
return ''
return self.html_sanitizer.clean(html)
class UserAuthMiddleware(BaseMiddleware): class UserAuthMiddleware(BaseMiddleware):
"""Transparently authenticates and registers the user if needed.""" """Transparently authenticates and registers the user if needed."""