youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

slideshare.py (2132B)


      1 from __future__ import unicode_literals
      2 
      3 import re
      4 import json
      5 
      6 from .common import InfoExtractor
      7 from ..compat import (
      8     compat_urlparse,
      9 )
     10 from ..utils import (
     11     ExtractorError,
     12     get_element_by_id,
     13 )
     14 
     15 
     16 class SlideshareIE(InfoExtractor):
     17     _VALID_URL = r'https?://(?:www\.)?slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)'
     18 
     19     _TEST = {
     20         'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
     21         'info_dict': {
     22             'id': '25665706',
     23             'ext': 'mp4',
     24             'title': 'Managing Scale and Complexity',
     25             'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix.',
     26         },
     27     }
     28 
     29     def _real_extract(self, url):
     30         mobj = re.match(self._VALID_URL, url)
     31         page_title = mobj.group('title')
     32         webpage = self._download_webpage(url, page_title)
     33         slideshare_obj = self._search_regex(
     34             r'\$\.extend\(.*?slideshare_object,\s*(\{.*?\})\);',
     35             webpage, 'slideshare object')
     36         info = json.loads(slideshare_obj)
     37         if info['slideshow']['type'] != 'video':
     38             raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)
     39 
     40         doc = info['doc']
     41         bucket = info['jsplayer']['video_bucket']
     42         ext = info['jsplayer']['video_extension']
     43         video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
     44         description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex(
     45             r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage,
     46             'description', fatal=False)
     47 
     48         return {
     49             '_type': 'video',
     50             'id': info['slideshow']['id'],
     51             'title': info['slideshow']['title'],
     52             'ext': ext,
     53             'url': video_url,
     54             'thumbnail': info['slideshow']['pin_image_url'],
     55             'description': description.strip() if description else None,
     56         }