From: Thorsten Date: Sat, 11 Apr 2020 14:41:12 +0000 (+0200) Subject: fix youtube title scraping X-Git-Url: https://git.aero2k.de/?a=commitdiff_plain;h=feb65cb7545891c8994e66612a434d2894ce6abc;p=urlbot-v3.git fix youtube title scraping --- diff --git a/distbot/minijobber/run.py b/distbot/minijobber/run.py index 4638d4d..f3ecccb 100644 --- a/distbot/minijobber/run.py +++ b/distbot/minijobber/run.py @@ -12,7 +12,7 @@ from distbot.plugins import ( basic, fun, lookup, url, feeds, muc, translation, searx, queue_management, plugin_help, morse, meta, extended, bugtracker, bots, bofh, didyouknow, - debug + debug, youtube ) logger = logging.getLogger(__name__) @@ -36,7 +36,8 @@ PLUGIN_MODULES = { searx: searx.ALL, translation: translation.ALL, url: url.ALL, - didyouknow: didyouknow.ALL + didyouknow: didyouknow.ALL, + youtube: youtube.ALL, # debug: debug.ALL } job_workers = [] diff --git a/distbot/plugins/youtube.py b/distbot/plugins/youtube.py new file mode 100644 index 0000000..c0db98a --- /dev/null +++ b/distbot/plugins/youtube.py @@ -0,0 +1,52 @@ +import re + +import requests + +from distbot.bot.worker import Worker +from distbot.common.action import Action + + +class Youtube(Worker): + """ + # approach 1: https://www.googleapis.com/youtube/v3/videos?part=snippet&id={YOUTUBE_VIDEO_ID}&fields=items(id,snippet)&key={YOUR_API_KEY} + # approach 2 (without key): https://www.youtube.com/oembed?url=http://youtube.com/watch?v={YOUTUBE_VIDEO_ID}&format=json + """ + binding_keys = [ + "*.youtube.com.*", "*.youtu.be.*", + "*.youtube.com.*.nospoiler.*", "*.youtu.be.*.nospoiler.*", + ] + description = "resolves titles of posted youtube URLs" + + URL_TEMPLATE = "https://www.youtube.com/oembed?url=http://youtube.com/watch?v={YOUTUBE_VIDEO_ID}&format=json" + + @staticmethod + def get_youtube_id_from_url(body): + # TODO: what about multiple urls? ignore for now... 
+ regex = r'https?://(www\.)?((youtube\.[a-z]+/watch\?v=)|(youtu.be/))(?P<youtubeid>[^&?#\) ]+)' + regex = re.compile(regex) + result = regex.search(body) + if not result: + return None + else: + return result.groupdict().get("youtubeid", None) + + def resolve_title_from_id(self, youtube_id): + response = requests.get(self.URL_TEMPLATE.format(YOUTUBE_VIDEO_ID=youtube_id)) + response.raise_for_status() + return response.json().get("title") + + def parse_body(self, msg): + try: + youtube_id = self.get_youtube_id_from_url(msg["body"]) + if not youtube_id: + return None + title = self.resolve_title_from_id(youtube_id) + if not title: + return None + except requests.HTTPError: + return Action(msg="google has had an accident...") + + return Action(msg="YouTube: " + title) + + +ALL = [Youtube] diff --git a/tests/test_unit/test_youtube_plugin.py b/tests/test_unit/test_youtube_plugin.py new file mode 100644 index 0000000..cf53883 --- /dev/null +++ b/tests/test_unit/test_youtube_plugin.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +import pytest + +import distbot.plugins.youtube as youtube_plugin + + +@pytest.mark.parametrize( + argnames='body,youtubeid', + argvalues=[ + ("https://www.youtube.com/watch?v=H27VcmHVRaw", "H27VcmHVRaw"), + ("https://www.youtube.com/watch?v=H27VcmHVRaw#t=314", "H27VcmHVRaw"), + ("https://youtube.com/watch?v=H27VcmHVRaw?tracking=foo - xxx", "H27VcmHVRaw"), + ("https://youtu.be/H27VcmHVRaw", "H27VcmHVRaw"), + ("https://youtu.be/H27VcmHVRaw#t=314", "H27VcmHVRaw"), + ("blabla (https://www.youtube.com/watch?v=H27VcmHVRaw) - xxx", "H27VcmHVRaw"), + ] +) +def test_resolve_youtube_id(body, youtubeid): + resolved_id = youtube_plugin.Youtube.get_youtube_id_from_url(body) + assert youtubeid == resolved_id