youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

buzzfeed.py (3655B)


      1 # coding: utf-8
      2 from __future__ import unicode_literals
      3 
      4 import json
      5 import re
      6 
      7 from .common import InfoExtractor
      8 from .facebook import FacebookIE
      9 
     10 
     11 class BuzzFeedIE(InfoExtractor):
     12     _VALID_URL = r'https?://(?:www\.)?buzzfeed\.com/[^?#]*?/(?P<id>[^?#]+)'
     13     _TESTS = [{
     14         'url': 'http://www.buzzfeed.com/abagg/this-angry-ram-destroys-a-punching-bag-like-a-boss?utm_term=4ldqpia',
     15         'info_dict': {
     16             'id': 'this-angry-ram-destroys-a-punching-bag-like-a-boss',
     17             'title': 'This Angry Ram Destroys A Punching Bag Like A Boss',
     18             'description': 'Rambro!',
     19         },
     20         'playlist': [{
     21             'info_dict': {
     22                 'id': 'aVCR29aE_OQ',
     23                 'ext': 'mp4',
     24                 'title': 'Angry Ram destroys a punching bag..',
     25                 'description': 'md5:c59533190ef23fd4458a5e8c8c872345',
     26                 'upload_date': '20141024',
     27                 'uploader_id': 'Buddhanz1',
     28                 'uploader': 'Angry Ram',
     29             }
     30         }]
     31     }, {
     32         'url': 'http://www.buzzfeed.com/sheridanwatson/look-at-this-cute-dog-omg?utm_term=4ldqpia',
     33         'params': {
     34             'skip_download': True,  # Got enough YouTube download tests
     35         },
     36         'info_dict': {
     37             'id': 'look-at-this-cute-dog-omg',
     38             'description': 're:Munchkin the Teddy Bear is back ?!',
     39             'title': 'You Need To Stop What You\'re Doing And Watching This Dog Walk On A Treadmill',
     40         },
     41         'playlist': [{
     42             'info_dict': {
     43                 'id': 'mVmBL8B-In0',
     44                 'ext': 'mp4',
     45                 'title': 're:Munchkin the Teddy Bear gets her exercise',
     46                 'description': 'md5:28faab95cda6e361bcff06ec12fc21d8',
     47                 'upload_date': '20141124',
     48                 'uploader_id': 'CindysMunchkin',
     49                 'uploader': 're:^Munchkin the',
     50             },
     51         }]
     52     }, {
     53         'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK',
     54         'info_dict': {
     55             'id': 'the-most-adorable-crash-landing-ever',
     56             'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing',
     57             'description': 'This gosling knows how to stick a landing.',
     58         },
     59         'playlist': [{
     60             'md5': '763ca415512f91ca62e4621086900a23',
     61             'info_dict': {
     62                 'id': '971793786185728',
     63                 'ext': 'mp4',
     64                 'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...',
     65                 'uploader': 'Calgary Outdoor Centre-University of Calgary',
     66             },
     67         }],
     68         'add_ie': ['Facebook'],
     69     }]
     70 
     71     def _real_extract(self, url):
     72         playlist_id = self._match_id(url)
     73         webpage = self._download_webpage(url, playlist_id)
     74 
     75         all_buckets = re.findall(
     76             r'(?s)<div class="video-embed[^"]*"..*?rel:bf_bucket_data=\'([^\']+)\'',
     77             webpage)
     78 
     79         entries = []
     80         for bd_json in all_buckets:
     81             bd = json.loads(bd_json)
     82             video = bd.get('video') or bd.get('progload_video')
     83             if not video:
     84                 continue
     85             entries.append(self.url_result(video['url']))
     86 
     87         facebook_urls = FacebookIE._extract_urls(webpage)
     88         entries.extend([
     89             self.url_result(facebook_url)
     90             for facebook_url in facebook_urls])
     91 
     92         return {
     93             '_type': 'playlist',
     94             'id': playlist_id,
     95             'title': self._og_search_title(webpage),
     96             'description': self._og_search_description(webpage),
     97             'entries': entries,
     98         }