[utils] add a function to clean podcast URLs
author Remita Amine <remitamine@gmail.com>
Sun, 3 Jan 2021 23:51:55 +0000 (00:51 +0100)
committer Remita Amine <remitamine@gmail.com>
Mon, 4 Jan 2021 00:14:25 +0000 (01:14 +0100)
test/test_utils.py
youtube_dl/utils.py

index d49d3239c3013c25d40e7cbc99ff567e144b3942..259c4763e1ee7124bb4e7199cbe73491f322e281 100644 (file)
@@ -21,6 +21,7 @@ from youtube_dl.utils import (
     encode_base_n,
     caesar,
     clean_html,
+    clean_podcast_url,
     date_from_str,
     DateRange,
     detect_exe_version,
@@ -1470,6 +1471,10 @@ Line 1
         self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
         self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])
 
+    def test_clean_podcast_url(self):
+        # Two stacked prefixes ('www.podtrac.com/pts/redirect.mp3/' and
+        # 'chtbl.com/track/5899E/') must both be removed, leaving the media URL.
+        self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3')
+        # A 'play.podtrac.com/<id>/' prefix must be removed together with its id segment.
+        self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3')
+
 
 if __name__ == '__main__':
     unittest.main()
index d5fb6fd24c3bbbe99ab8c843cc95e44f5326ee67..8e4d144c9ace42e6c8a5281f4e221d276e519901 100644 (file)
@@ -5706,3 +5706,20 @@ def random_birthday(year_field, month_field, day_field):
         month_field: str(random_date.month),
         day_field: str(random_date.day),
     }
+
+
+def clean_podcast_url(url):
+    """Return url with known podcast tracking/analytics prefixes removed.
+
+    Every occurrence of one of the prefixes listed in the pattern below
+    (together with its trailing slash) is replaced with an empty string, so
+    several chained prefixes are all stripped in a single call. The pattern
+    is not anchored, and a url containing none of the prefixes is returned
+    unchanged.
+    """
+    return re.sub(r'''(?x)
+        (?:
+            (?:
+                chtbl\.com/track|
+                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
+                play\.podtrac\.com
+            )/[^/]+|
+            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
+            flex\.acast\.com|
+            pd(?:
+                cn\.co| # https://podcorn.com/analytics-prefix/
+                st\.fm # https://podsights.com/docs/
+            )/e
+        )/''', '', url)