From 56cb4138b570960a592ff82afbee0d4f6c015d90 Mon Sep 17 00:00:00 2001 From: Alexey Skobkin Date: Sun, 10 Jul 2022 13:22:44 +0300 Subject: [PATCH] Sanitizing HTML to leave only HTML tags allowed by Telegram Bot API. (#33) Reviewed-on: https://git.skobk.in/Miroslavsckaya/tg_rss_bot/pulls/33 Reviewed-by: Miroslavsckaya Co-authored-by: Alexey Skobkin Co-committed-by: Alexey Skobkin --- requirements.txt | 3 +++ telegram.py | 20 ++++++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 46d6009..ec28d3f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +bleach==5.0.1 certifi==2021.10.8 charset-normalizer==2.0.12 decorator==5.1.1 @@ -7,5 +8,7 @@ pyTelegramBotAPI==4.5.0 python-dotenv==0.20.0 requests==2.27.1 sgmllib3k==1.0.0 +six==1.16.0 urllib3==1.26.9 validators==0.19.0 +webencodings==0.5.1 diff --git a/telegram.py b/telegram.py index 6ed57d2..aecb49e 100644 --- a/telegram.py +++ b/telegram.py @@ -1,5 +1,6 @@ import time +from bleach.sanitizer import Cleaner from telebot import TeleBot from telebot.handler_backends import BaseMiddleware from telebot.types import Message @@ -32,6 +33,7 @@ class CommandProcessor: self.bot.infinity_polling() def __command_help(self, message: Message, data: dict): + # pylint: disable=unused-argument self.bot.reply_to( message, 'Supported commands:\n' @@ -96,6 +98,12 @@ class Notifier: def __init__(self, token: str): self.bot: TeleBot = TeleBot(token) + self.html_sanitizer: Cleaner = Cleaner( + tags=['b', 'strong', 'i', 'em', 'u', 'ins', 's', 'strike', 'del', 'span', 'tg-spoiler', 'a', 'code', 'pre'], + attributes={"a": ["href", "title"]}, + protocols=['http', 'https'], + strip=True, + ) def send_updates(self, chat_ids: list[int], updates: list[FeedItem], feed_title: str): """Send notification about new items to the user""" @@ -127,14 +135,18 @@ class Notifier: self.sent_counter = 0 self.sent_counter += 1 - @staticmethod - def __format_message(item: FeedItem) -> str: + def __format_message(self, item: FeedItem) -> str: return ( + # TODO: Return date when FeedItem starts to return formattable datetime object f"{item.title}\n\n" - f"{item.date}\n" - # f"{item.description}" + f"{self.__sanitize_html(item.description)}" ) + def __sanitize_html(self, html: str) -> str: + if not html: + return '' + return self.html_sanitizer.clean(html) + class UserAuthMiddleware(BaseMiddleware): """Transparently authenticates and registers the user if needed."""