xtube.py - youtube-dl - Another place where youtube-dl lives on

xtube.py (8579B)
      1 from __future__ import unicode_literals
      2 
      3 import itertools
      4 import re
      5 
      6 from .common import InfoExtractor
      7 from ..utils import (
      8     int_or_none,
      9     js_to_json,
     10     orderedSet,
     11     parse_duration,
     12     sanitized_Request,
     13     str_to_int,
     14     url_or_none,
     15 )
     16 
     17 
     18 class XTubeIE(InfoExtractor):
     19     _VALID_URL = r'''(?x)
     20                         (?:
     21                             xtube:|
     22                             https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?:embedded/)?(?P<display_id>[^/]+)-)
     23                         )
     24                         (?P<id>[^/?&#]+)
     25                     '''
     26 
     27     _TESTS = [{
     28         # old URL schema
     29         'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
     30         'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
     31         'info_dict': {
     32             'id': 'kVTUy_G222_',
     33             'ext': 'mp4',
     34             'title': 'strange erotica',
     35             'description': 'contains:an ET kind of thing',
     36             'uploader': 'greenshowers',
     37             'duration': 450,
     38             'view_count': int,
     39             'comment_count': int,
     40             'age_limit': 18,
     41         }
     42     }, {
     43         # FLV videos with duplicated formats
     44         'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752',
     45         'md5': 'a406963eb349dd43692ec54631efd88b',
     46         'info_dict': {
     47             'id': '9299752',
     48             'display_id': 'A-Super-Run-Part-1-YT',
     49             'ext': 'flv',
     50             'title': 'A Super Run - Part 1 (YT)',
     51             'description': 'md5:4cc3af1aa1b0413289babc88f0d4f616',
     52             'uploader': 'tshirtguy59',
     53             'duration': 579,
     54             'view_count': int,
     55             'comment_count': int,
     56             'age_limit': 18,
     57         },
     58     }, {
     59         # new URL schema
     60         'url': 'http://www.xtube.com/video-watch/strange-erotica-625837',
     61         'only_matching': True,
     62     }, {
     63         'url': 'xtube:625837',
     64         'only_matching': True,
     65     }, {
     66         'url': 'xtube:kVTUy_G222_',
     67         'only_matching': True,
     68     }, {
     69         'url': 'https://www.xtube.com/video-watch/embedded/milf-tara-and-teen-shared-and-cum-covered-extreme-bukkake-32203482?embedsize=big',
     70         'only_matching': True,
     71     }]
     72 
     73     def _real_extract(self, url):
     74         mobj = re.match(self._VALID_URL, url)
     75         video_id = mobj.group('id')
     76         display_id = mobj.group('display_id')
     77 
     78         if not display_id:
     79             display_id = video_id
     80 
     81         if video_id.isdigit() and len(video_id) < 11:
     82             url_pattern = 'http://www.xtube.com/video-watch/-%s'
     83         else:
     84             url_pattern = 'http://www.xtube.com/watch.php?v=%s'
     85 
     86         webpage = self._download_webpage(
     87             url_pattern % video_id, display_id, headers={
     88                 'Cookie': 'age_verified=1; cookiesAccepted=1',
     89             })
     90 
     91         title, thumbnail, duration, sources, media_definition = [None] * 5
     92 
     93         config = self._parse_json(self._search_regex(
     94             r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf|playerWrapper)', webpage, 'config',
     95             default='{}'), video_id, transform_source=js_to_json, fatal=False)
     96         if config:
     97             config = config.get('mainRoll')
     98             if isinstance(config, dict):
     99                 title = config.get('title')
    100                 thumbnail = config.get('poster')
    101                 duration = int_or_none(config.get('duration'))
    102                 sources = config.get('sources') or config.get('format')
    103                 media_definition = config.get('mediaDefinition')
    104 
    105         if not isinstance(sources, dict) and not media_definition:
    106             sources = self._parse_json(self._search_regex(
    107                 r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),',
    108                 webpage, 'sources', group='sources'), video_id,
    109                 transform_source=js_to_json)
    110 
    111         formats = []
    112         format_urls = set()
    113 
    114         if isinstance(sources, dict):
    115             for format_id, format_url in sources.items():
    116                 format_url = url_or_none(format_url)
    117                 if not format_url:
    118                     continue
    119                 if format_url in format_urls:
    120                     continue
    121                 format_urls.add(format_url)
    122                 formats.append({
    123                     'url': format_url,
    124                     'format_id': format_id,
    125                     'height': int_or_none(format_id),
    126                 })
    127 
    128         if isinstance(media_definition, list):
    129             for media in media_definition:
    130                 video_url = url_or_none(media.get('videoUrl'))
    131                 if not video_url:
    132                     continue
    133                 if video_url in format_urls:
    134                     continue
    135                 format_urls.add(video_url)
    136                 format_id = media.get('format')
    137                 if format_id == 'hls':
    138                     formats.extend(self._extract_m3u8_formats(
    139                         video_url, video_id, 'mp4', entry_protocol='m3u8_native',
    140                         m3u8_id='hls', fatal=False))
    141                 elif format_id == 'mp4':
    142                     height = int_or_none(media.get('quality'))
    143                     formats.append({
    144                         'url': video_url,
    145                         'format_id': '%s-%d' % (format_id, height) if height else format_id,
    146                         'height': height,
    147                     })
    148 
    149         self._remove_duplicate_formats(formats)
    150         self._sort_formats(formats)
    151 
    152         if not title:
    153             title = self._search_regex(
    154                 (r'<h1>\s*(?P<title>[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'),
    155                 webpage, 'title', group='title')
    156         description = self._og_search_description(
    157             webpage, default=None) or self._html_search_meta(
    158             'twitter:description', webpage, default=None) or self._search_regex(
    159             r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False)
    160         uploader = self._search_regex(
    161             (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"',
    162              r'<span[^>]+class="nickname"[^>]*>([^<]+)'),
    163             webpage, 'uploader', fatal=False)
    164         if not duration:
    165             duration = parse_duration(self._search_regex(
    166                 r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>',
    167                 webpage, 'duration', fatal=False))
    168         view_count = str_to_int(self._search_regex(
    169             (r'["\']viewsCount["\'][^>]*>(\d+)\s+views',
    170              r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>'),
    171             webpage, 'view count', fatal=False))
    172         comment_count = str_to_int(self._html_search_regex(
    173             r'>Comments? \(([\d,\.]+)\)<',
    174             webpage, 'comment count', fatal=False))
    175 
    176         return {
    177             'id': video_id,
    178             'display_id': display_id,
    179             'title': title,
    180             'description': description,
    181             'thumbnail': thumbnail,
    182             'uploader': uploader,
    183             'duration': duration,
    184             'view_count': view_count,
    185             'comment_count': comment_count,
    186             'age_limit': 18,
    187             'formats': formats,
    188         }
    189 
    190 
    191 class XTubeUserIE(InfoExtractor):
    192     IE_DESC = 'XTube user profile'
    193     _VALID_URL = r'https?://(?:www\.)?xtube\.com/profile/(?P<id>[^/]+-\d+)'
    194     _TEST = {
    195         'url': 'http://www.xtube.com/profile/greenshowers-4056496',
    196         'info_dict': {
    197             'id': 'greenshowers-4056496',
    198             'age_limit': 18,
    199         },
    200         'playlist_mincount': 154,
    201     }
    202 
    203     def _real_extract(self, url):
    204         user_id = self._match_id(url)
    205 
    206         entries = []
    207         for pagenum in itertools.count(1):
    208             request = sanitized_Request(
    209                 'http://www.xtube.com/profile/%s/videos/%d' % (user_id, pagenum),
    210                 headers={
    211                     'Cookie': 'popunder=4',
    212                     'X-Requested-With': 'XMLHttpRequest',
    213                     'Referer': url,
    214                 })
    215 
    216             page = self._download_json(
    217                 request, user_id, 'Downloading videos JSON page %d' % pagenum)
    218 
    219             html = page.get('html')
    220             if not html:
    221                 break
    222 
    223             for video_id in orderedSet([video_id for _, video_id in re.findall(
    224                     r'data-plid=(["\'])(.+?)\1', html)]):
    225                 entries.append(self.url_result('xtube:%s' % video_id, XTubeIE.ie_key()))
    226 
    227             page_count = int_or_none(page.get('pageCount'))
    228             if not page_count or pagenum == page_count:
    229                 break
    230 
    231         playlist = self.playlist_result(entries, user_id)
    232         playlist['age_limit'] = 18
    233         return playlist
	youtube-dl Another place where youtube-dl lives on
	git clone git://git.oshgnacknak.de/youtube-dl.git
	Log \| Files \| Refs \| README \| LICENSE