Sanitize HTML so that only the tags allowed by the Telegram Bot API remain. #33

Merged
Miroslavsckaya merged 3 commits from feature_filter_html_tags into master 2022-07-10 10:22:45 +00:00
2 changed files with 19 additions and 4 deletions

View File

@ -1,3 +1,4 @@
bleach==5.0.1
certifi==2021.10.8
charset-normalizer==2.0.12
decorator==5.1.1
@ -7,5 +8,7 @@ pyTelegramBotAPI==4.5.0
python-dotenv==0.20.0
requests==2.27.1
sgmllib3k==1.0.0
six==1.16.0
urllib3==1.26.9
validators==0.19.0
webencodings==0.5.1

View File

@ -1,5 +1,6 @@
import time
from bleach.sanitizer import Cleaner
from telebot import TeleBot
from telebot.handler_backends import BaseMiddleware
from telebot.types import Message
@ -32,6 +33,7 @@ class CommandProcessor:
self.bot.infinity_polling()
def __command_help(self, message: Message, data: dict):
# pylint: disable=unused-argument
self.bot.reply_to(
message,
'Supported commands:\n'
@ -96,6 +98,12 @@ class Notifier:
def __init__(self, token: str):
    """Create the bot client and the HTML sanitizer used for outgoing messages."""
    self.bot: TeleBot = TeleBot(token)
    # Whitelist for Telegram's HTML parse mode. `span` is intentionally not
    # listed: Telegram only documents <span class="tg-spoiler">, and because no
    # attributes are allowed on it here, every span from a feed would come out
    # as a bare <span>. Leaving it out makes bleach strip the tag (strip=True)
    # while keeping its text; spoilers remain available through <tg-spoiler>.
    self.html_sanitizer: Cleaner = Cleaner(
        tags=['b', 'strong', 'i', 'em', 'u', 'ins', 's', 'strike', 'del', 'tg-spoiler', 'a', 'code', 'pre'],
        attributes={"a": ["href", "title"]},
        protocols=['http', 'https'],
        # Remove disallowed tags instead of escaping them into visible text.
        strip=True,
    )
def send_updates(self, chat_ids: list[int], updates: list[FeedItem], feed_title: str):
"""Send notification about new items to the user"""
@ -127,14 +135,18 @@ class Notifier:
self.sent_counter = 0
self.sent_counter += 1
@staticmethod
def __format_message(item: FeedItem) -> str:
def __format_message(self, item: FeedItem) -> str:
return (
# TODO: Return date when FeedItem starts to return formattable datetime object
f"<strong><a href=\"{item.url}\">{item.title}</a></strong>\n\n"
f"{item.date}\n"
# f"{item.description}"
f"{self.__sanitize_html(item.description)}"
)
def __sanitize_html(self, html: str) -> str:
if not html:
return ''
return self.html_sanitizer.clean(html)
class UserAuthMiddleware(BaseMiddleware):
"""Transparently authenticates and registers the user if needed."""