git.aero2k.de Git - urlbot-v3.git/commitdiff
fix youtube title scraping
author Thorsten <mail@aero2k.de>
Sat, 11 Apr 2020 14:41:12 +0000 (16:41 +0200)
committer Thorsten <mail@aero2k.de>
Sat, 11 Apr 2020 14:41:12 +0000 (16:41 +0200)
distbot/minijobber/run.py
distbot/plugins/youtube.py [new file with mode: 0644]
tests/test_unit/test_youtube_plugin.py [new file with mode: 0644]

diff --git a/distbot/minijobber/run.py b/distbot/minijobber/run.py
index 4638d4de1b24bbd409ce3759fb05bdb1162fb760..f3ecccb3e79bbbe4e051ff7d96618dd49e28f9a8 100644 (file)
@@ -12,7 +12,7 @@ from distbot.plugins import (
     basic, fun, lookup, url, feeds, muc, translation, searx, queue_management, plugin_help,
     morse, meta,
     extended, bugtracker, bots, bofh, didyouknow,
-    debug
+    debug, youtube
 )
 logger = logging.getLogger(__name__)
 
@@ -36,7 +36,8 @@ PLUGIN_MODULES = {
     searx: searx.ALL,
     translation: translation.ALL,
     url: url.ALL,
-    didyouknow: didyouknow.ALL
+    didyouknow: didyouknow.ALL,
+    youtube: youtube.ALL,
     # debug: debug.ALL
 }
 job_workers = []
diff --git a/distbot/plugins/youtube.py b/distbot/plugins/youtube.py
new file mode 100644 (file)
index 0000000..c0db98a
--- /dev/null
@@ -0,0 +1,89 @@
+import re
+
+import requests
+
+from distbot.bot.worker import Worker
+from distbot.common.action import Action
+
+# Compiled once at import time. The dot in "youtu.be" is escaped so that
+# look-alike hosts such as "youtuXbe" are not mistaken for the short domain.
+YOUTUBE_URL_REGEX = re.compile(
+    r'https?://(www\.)?((youtube\.[a-z]+/watch\?v=)|(youtu\.be/))(?P<youtubeid>[^&?#\) ]+)'
+)
+
+
+class Youtube(Worker):
+    """
+    Worker that answers messages containing a YouTube link with the video title.
+
+    The title is looked up via YouTube's oEmbed endpoint (no API key needed):
+    https://www.youtube.com/oembed?url=http://youtube.com/watch?v={YOUTUBE_VIDEO_ID}&format=json
+    Alternative that requires an API key:
+    https://www.googleapis.com/youtube/v3/videos?part=snippet&id={YOUTUBE_VIDEO_ID}&fields=items(id,snippet)&key={YOUR_API_KEY}
+    """
+    binding_keys = [
+        "*.youtube.com.*", "*.youtu.be.*",
+        "*.youtube.com.*.nospoiler.*", "*.youtu.be.*.nospoiler.*",
+    ]
+    description = "resolves titles of posted youtube URLs"
+
+    URL_TEMPLATE = "https://www.youtube.com/oembed?url=http://youtube.com/watch?v={YOUTUBE_VIDEO_ID}&format=json"
+
+    # Seconds to wait for the oEmbed endpoint before giving up.
+    REQUEST_TIMEOUT = 10
+
+    @staticmethod
+    def get_youtube_id_from_url(body):
+        """
+        Extract the video id of the first YouTube URL found in ``body``.
+
+        :param body: message text that may contain a youtube.* or youtu.be link
+        :return: the video id as a string, or None if no such URL is present
+        """
+        # TODO: what about multiple urls? ignore for now...
+        result = YOUTUBE_URL_REGEX.search(body)
+        if not result:
+            return None
+        return result.group("youtubeid")
+
+    def resolve_title_from_id(self, youtube_id):
+        """
+        Fetch the title of a video from the oEmbed endpoint.
+
+        :param youtube_id: video id as returned by get_youtube_id_from_url
+        :return: the "title" field of the JSON response, or None if absent
+        :raises requests.RequestException: on connection problems, timeouts or
+            a 4xx/5xx status code
+        """
+        response = requests.get(
+            self.URL_TEMPLATE.format(YOUTUBE_VIDEO_ID=youtube_id),
+            # requests waits indefinitely unless a timeout is given
+            timeout=self.REQUEST_TIMEOUT,
+        )
+        response.raise_for_status()
+        return response.json().get("title")
+
+    def parse_body(self, msg):
+        """
+        Build the reply for a message that may contain a YouTube link.
+
+        :param msg: message mapping; only msg["body"] is read
+        :return: an Action carrying the title (or an error notice when the
+            lookup fails), or None if there is no link or no title
+        """
+        youtube_id = self.get_youtube_id_from_url(msg["body"])
+        if not youtube_id:
+            return None
+        try:
+            title = self.resolve_title_from_id(youtube_id)
+        except requests.RequestException:
+            # HTTPError is a RequestException; timeouts and connection
+            # failures get the same reply instead of escaping this method
+            return Action(msg="google has had an accident...")
+        if not title:
+            return None
+
+        return Action(msg="YouTube: " + title)
+
+
+ALL = [Youtube]
diff --git a/tests/test_unit/test_youtube_plugin.py b/tests/test_unit/test_youtube_plugin.py
new file mode 100644 (file)
index 0000000..cf53883
--- /dev/null
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import distbot.plugins.youtube as youtube_plugin
+
+# (message body, video id expected to be extracted from it)
+YOUTUBE_ID_CASES = [
+    ("https://www.youtube.com/watch?v=H27VcmHVRaw", "H27VcmHVRaw"),
+    ("https://www.youtube.com/watch?v=H27VcmHVRaw#t=314", "H27VcmHVRaw"),
+    ("https://youtube.com/watch?v=H27VcmHVRaw?tracking=foo - xxx", "H27VcmHVRaw"),
+    ("https://youtu.be/H27VcmHVRaw", "H27VcmHVRaw"),
+    ("https://youtu.be/H27VcmHVRaw#t=314", "H27VcmHVRaw"),
+    ("blabla (https://www.youtube.com/watch?v=H27VcmHVRaw) - xxx", "H27VcmHVRaw"),
+]
+
+
+@pytest.mark.parametrize('body,youtubeid', YOUTUBE_ID_CASES)
+def test_resolve_youtube_id(body, youtubeid):
+    """The id is found in long and short URLs; fragments, extra query text and surrounding prose are ignored."""
+    extract_id = youtube_plugin.Youtube.get_youtube_id_from_url
+    assert extract_id(body) == youtubeid