[younow] Fix issues and improve extraction (closes #9255, closes #9432, closes #12436) - youtube-dl

commit eb4b5818e2a297bd001eb1b4962d709b1245fd24
parent 47a8587915668ef82632a7a75f8bc9862679623a
Author: Sergey M․ <dstftw@gmail.com>
Date:   Sun, 29 Oct 2017 04:16:07 +0700

[younow] Fix issues and improve extraction (closes #9255, closes #9432, closes #12436)

Diffstat:
M youtube_dl/extractor/extractors.py  | 2 +-
M youtube_dl/extractor/younow.py  | 225 ++++++++++++++++++++++++++++++++++++++++---------------------------------------

2 files changed, 116 insertions(+), 111 deletions(-)
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
@@ -1336,7 +1336,7 @@ from .youku import (
     YoukuShowIE,
 )
 from .younow import (
-    YouNowIE,
+    YouNowLiveIE,
     YouNowChannelIE,
     YouNowMomentIE,
 )
diff --git a/youtube_dl/extractor/younow.py b/youtube_dl/extractor/younow.py
@@ -1,17 +1,22 @@
 # coding: utf-8
 from __future__ import unicode_literals
-from datetime import date, datetime
+
+import itertools
 
 from .common import InfoExtractor
 from ..compat import compat_str
-from ..utils import int_or_none, UnsupportedError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    try_get,
+)
 
-MOMENT_URL_FORMAT = 'https://cdn.younow.com/php/api/moment/fetch/id=%s'
-STREAM_URL_FORMAT = 'https://hls.younow.com/momentsplaylists/live/%s/%s.m3u8'
+CDN_API_BASE = 'https://cdn.younow.com/php/api'
+MOMENT_URL_FORMAT = '%s/moment/fetch/id=%%s' % CDN_API_BASE
 
 
-class YouNowIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P<id>[^/]+)'
+class YouNowLiveIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P<id>[^/?#&]+)'
     _TEST = {
         'url': 'https://www.younow.com/AmandaPadeezy',
         'info_dict': {
@@ -19,179 +24,179 @@ class YouNowIE(InfoExtractor):
             'ext': 'mp4',
             'is_live': True,
             'title': 'March 26, 2017',
-            'description': 'YouNow is the best way to broadcast live and get an audience to watch you.',
-            'thumbnail': 'https://ynassets.s3.amazonaws.com/broadcast/live/157869188/157869188.jpg',
+            'thumbnail': r're:^https?://.*\.jpg$',
             'tags': ['girls'],
             'categories': ['girls'],
             'uploader': 'AmandaPadeezy',
             'uploader_id': '6716501',
             'uploader_url': 'https://www.younow.com/AmandaPadeezy',
             'creator': 'AmandaPadeezy',
-            'formats': [{
-                'url': 'https://cdn.younow.com/php/api/broadcast/videoPath/hls=1/broadcastId=157869188/channelId=6716501',
-                'ext': 'mp4',
-                'protocol': 'm3u8',
-            }],
-        }
+        },
+        'skip': True,
     }
 
+    @classmethod
+    def suitable(cls, url):
+        return (False
+                if YouNowChannelIE.suitable(url) or YouNowMomentIE.suitable(url)
+                else super(YouNowLiveIE, cls).suitable(url))
+
     def _real_extract(self, url):
         username = self._match_id(url)
-        data = self._download_json('https://api.younow.com/php/api/broadcast/info/curId=0/user=%s' % (username), username)
-
-        if data.get('media'):
-            stream_url = 'https://cdn.younow.com/php/api/broadcast/videoPath/hls=1/broadcastId=%s/channelId=%s' % (
-                data.get('broadcastId'),
-                data.get('userId'),
-            )
-        else:
-            raise UnsupportedError('Unsupported stream or user is not streaming at this time')
-
-        webpage = self._download_webpage(url, username)
-        try:
-            uploader = data['user']['profileUrlString']
-        except KeyError:
-            uploader = username
-        try:
-            title = data['title']
-        except KeyError:
-            title = date.today().strftime('%B %d, %Y')
+
+        data = self._download_json(
+            'https://api.younow.com/php/api/broadcast/info/curId=0/user=%s'
+            % username, username)
+
+        if data.get('errorCode') != 0:
+            raise ExtractorError(data['errorMsg'], expected=True)
+
+        uploader = try_get(
+            data, lambda x: x['user']['profileUrlString'],
+            compat_str) or username
 
         return {
             'id': uploader,
             'is_live': True,
-            'title': title,
-            'description': self._og_search_description(webpage),
+            'title': self._live_title(uploader),
             'thumbnail': data.get('awsUrl'),
             'tags': data.get('tags'),
             'categories': data.get('tags'),
             'uploader': uploader,
             'uploader_id': data.get('userId'),
-            'uploader_url': 'https://www.younow.com/%s' % (data['user']['profileUrlString'],),
+            'uploader_url': 'https://www.younow.com/%s' % username,
             'creator': uploader,
             'view_count': int_or_none(data.get('viewers')),
             'like_count': int_or_none(data.get('likes')),
             'formats': [{
-                'url': stream_url,
+                'url': '%s/broadcast/videoPath/hls=1/broadcastId=%s/channelId=%s'
+                       % (CDN_API_BASE, data['broadcastId'], data['userId']),
                 'ext': 'mp4',
                 'protocol': 'm3u8',
             }],
         }
 
 
-def _moment_to_entry(item):
+def _extract_moment(item, fatal=True):
+    moment_id = item.get('momentId')
+    if not moment_id:
+        if not fatal:
+            return
+        raise ExtractorError('Unable to extract moment id')
+
+    moment_id = compat_str(moment_id)
+
     title = item.get('text')
-    title_type = item.get('titleType')
     if not title:
-        if title_type:
-            title = 'YouNow %s' % item.get('titleType')
-        else:
-            title = 'YouNow moment'
+        title = 'YouNow %s' % (
+            item.get('momentType') or item.get('titleType') or 'moment')
+
+    uploader = try_get(item, lambda x: x['owner']['name'], compat_str)
+    uploader_id = try_get(item, lambda x: x['owner']['userId'])
+    uploader_url = 'https://www.younow.com/%s' % uploader if uploader else None
 
     entry = {
-        'id': compat_str(item['momentId']),
+        'extractor_key': 'YouNowMoment',
+        'id': moment_id,
         'title': title,
         'view_count': int_or_none(item.get('views')),
         'like_count': int_or_none(item.get('likes')),
         'timestamp': int_or_none(item.get('created')),
+        'creator': uploader,
+        'uploader': uploader,
+        'uploader_id': uploader_id,
+        'uploader_url': uploader_url,
         'formats': [{
-            'url': STREAM_URL_FORMAT % (item['momentId'], item['momentId']),
+            'url': 'https://hls.younow.com/momentsplaylists/live/%s/%s.m3u8'
+                   % (moment_id, moment_id),
             'ext': 'mp4',
-            'protocol': 'm3u8',
+            'protocol': 'm3u8_native',
         }],
     }
 
-    try:
-        entry['uploader'] = entry['creator'] = item['owner']['name']
-        entry['uploader_url'] = 'https://www.younow.com/%s' % (item['owner']['name'],)
-        entry['uploader_id'] = item['owner']['userId']
-    except KeyError:
-        pass
-
     return entry
 
 
 class YouNowChannelIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P<id>[^/]+)/channel'
     _TEST = {
-        'url': 'https://www.younow.com/Kate_Swiz/channel',
+        'url': 'https://www.younow.com/its_Kateee_/channel',
         'info_dict': {
-            'title': 'Kate_Swiz moments'
+            'id': '14629760',
+            'title': 'its_Kateee_ moments'
         },
-        'playlist_count': 6,
+        'playlist_mincount': 8,
     }
 
-    MOMENTS_URL_FORMAT = 'https://cdn.younow.com/php/api/moment/profile/channelId=%s/createdBefore=%d/records=20'
-
-    def _real_extract(self, url):
-        entries = []
-        username = self._match_id(url)
-        user_info = self._download_json('https://api.younow.com/php/api/broadcast/info/curId=0/user=%s' % (username), username, note='Downloading user information')
-        channel_id = user_info['userId']
+    def _entries(self, username, channel_id):
         created_before = 0
-        moment_ids = []
-        moment_ids_processed = []
-        err = False
-
-        while True:
-            if created_before:
-                cb = datetime.fromtimestamp(created_before)
-            else:
-                cb = datetime.now()
-            info = self._download_json(self.MOMENTS_URL_FORMAT % (channel_id, created_before), username, note='Downloading moments data (created before %s)' % (cb))
-
-            for item in info['items']:
-                if item['type'] == 'moment':
-                    entry = _moment_to_entry(item)
-                    moment_ids_processed.append(entry['id'])
-                    entries.append(entry)
-                elif item['type'] == 'collection':
-                    moment_ids += [compat_str(x) for x in item['momentsIds']]
-
-                try:
-                    created_before = int_or_none(item['created'])
-                except KeyError:
-                    err = True
-                    break
-
-            if (err or
-                    not info['hasMore'] or
-                    'items' not in info or
-                    not info['items']):
+        for page_num in itertools.count(1):
+            if created_before is None:
                 break
+            info = self._download_json(
+                '%s/moment/profile/channelId=%s/createdBefore=%d/records=20'
+                % (CDN_API_BASE, channel_id, created_before), username,
+                note='Downloading moments page %d' % page_num)
+            items = info.get('items')
+            if not items or not isinstance(items, list):
+                break
+            for item in items:
+                if not isinstance(item, dict):
+                    continue
+                item_type = item.get('type')
+                if item_type == 'moment':
+                    entry = _extract_moment(item, fatal=False)
+                    if entry:
+                        yield entry
+                elif item_type == 'collection':
+                    moments = item.get('momentsIds')
+                    if isinstance(moments, list):
+                        for moment_id in moments:
+                            m = self._download_json(
+                                MOMENT_URL_FORMAT % moment_id, username,
+                                note='Downloading %s moment JSON' % moment_id,
+                                fatal=False)
+                            if m and isinstance(m, dict) and m.get('item'):
+                                entry = _extract_moment(m['item'])
+                                if entry:
+                                    yield entry
+                created_before = int_or_none(item.get('created'))
 
-        for mid in set(moment_ids):
-            if mid in moment_ids_processed:
-                continue
-            item = self._download_json(MOMENT_URL_FORMAT % (mid), mid)
-            entries.append(_moment_to_entry(item['item']))
-
-        return self.playlist_result(entries, playlist_title='%s moments' % (username))
+    def _real_extract(self, url):
+        username = self._match_id(url)
+        channel_id = compat_str(self._download_json(
+            'https://api.younow.com/php/api/broadcast/info/curId=0/user=%s'
+            % username, username, note='Downloading user information')['userId'])
+        return self.playlist_result(
+            self._entries(username, channel_id), channel_id,
+            '%s moments' % username)
 
 
 class YouNowMomentIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?younow\.com/[^/]+/(?P<id>[^/]+)/[^/]+'
+    _VALID_URL = r'https?://(?:www\.)?younow\.com/[^/]+/(?P<id>[^/?#&]+)'
     _TEST = {
         'url': 'https://www.younow.com/GABO.../20712117/36319236/3b316doc/m',
+        'md5': 'a30c70eadb9fb39a1aa3c8c0d22a0807',
         'info_dict': {
             'id': '20712117',
             'ext': 'mp4',
             'title': 'YouNow capture',
-            'view_count': 19,
-            'like_count': 0,
+            'view_count': int,
+            'like_count': int,
             'timestamp': 1490432040,
-            'formats': [{
-                'url': 'https://hls.younow.com/momentsplaylists/live/20712117/20712117.m3u8',
-                'ext': 'mp4',
-                'protocol': 'm3u8',
-            }],
             'upload_date': '20170325',
             'uploader': 'GABO...',
             'uploader_id': 35917228,
         },
     }
 
+    @classmethod
+    def suitable(cls, url):
+        return (False
+                if YouNowChannelIE.suitable(url)
+                else super(YouNowMomentIE, cls).suitable(url))
+
     def _real_extract(self, url):
-        mid = self._match_id(url)
-        item = self._download_json(MOMENT_URL_FORMAT % (mid), mid)
-        return _moment_to_entry(item['item'])
+        video_id = self._match_id(url)
+        item = self._download_json(MOMENT_URL_FORMAT % video_id, video_id)
+        return _extract_moment(item['item'])

	youtube-dl Another place where youtube-dl lives on
	git clone git://git.oshgnacknak.de/youtube-dl.git
	Log \| Files \| Refs \| README \| LICENSE

M	youtube_dl/extractor/extractors.py	\|	2	+-
M	youtube_dl/extractor/younow.py	\|	225	++++++++++++++++++++++++++++++++++++++++---------------------------------------