vier.py (10008B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import re 5 import itertools 6 7 from .common import InfoExtractor 8 from ..utils import ( 9 urlencode_postdata, 10 int_or_none, 11 unified_strdate, 12 ) 13 14 15 class VierIE(InfoExtractor): 16 IE_NAME = 'vier' 17 IE_DESC = 'vier.be and vijf.be' 18 _VALID_URL = r'''(?x) 19 https?:// 20 (?:www\.)?(?P<site>vier|vijf)\.be/ 21 (?: 22 (?: 23 [^/]+/videos| 24 video(?:/[^/]+)* 25 )/ 26 (?P<display_id>[^/]+)(?:/(?P<id>\d+))?| 27 (?: 28 video/v3/embed| 29 embed/video/public 30 )/(?P<embed_id>\d+) 31 ) 32 ''' 33 _NETRC_MACHINE = 'vier' 34 _TESTS = [{ 35 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', 36 'md5': 'e4ae2054a6b040ef1e289e20d111b46e', 37 'info_dict': { 38 'id': '16129', 39 'display_id': 'het-wordt-warm-de-moestuin', 40 'ext': 'mp4', 41 'title': 'Het wordt warm in De Moestuin', 42 'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...', 43 'upload_date': '20121025', 44 'series': 'Plan B', 45 'tags': ['De Moestuin', 'Moestuin', 'meisjes', 'Tomaat', 'Wim', 'Droom'], 46 }, 47 }, { 48 'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614', 49 'info_dict': { 50 'id': '2561614', 51 'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas', 52 'ext': 'mp4', 53 'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7', 54 'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe', 55 'upload_date': '20170228', 56 'series': 'Temptation Island', 57 'tags': list, 58 }, 59 'params': { 60 'skip_download': True, 61 }, 62 }, { 63 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', 64 'info_dict': { 65 'id': '2674839', 66 'display_id': 'jani-gaat-naar-tokio-aflevering-4', 67 'ext': 'mp4', 68 'title': 'Jani gaat naar Tokio - Aflevering 4', 69 'description': 'md5:aa8d611541db6ae9e863125704511f88', 70 'upload_date': '20170501', 71 'series': 'Jani gaat', 72 'episode_number': 4, 73 'tags': ['Jani Gaat', 'Volledige Aflevering'], 74 }, 75 'params': { 76 'skip_download': True, 77 }, 78 'skip': 'Requires account credentials', 79 }, { 80 # Requires account credentials but bypassed extraction via v3/embed page 81 # without metadata 82 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', 83 'info_dict': { 84 'id': '2674839', 85 'display_id': 'jani-gaat-naar-tokio-aflevering-4', 86 'ext': 'mp4', 87 'title': 'jani-gaat-naar-tokio-aflevering-4', 88 }, 89 'params': { 90 'skip_download': True, 91 }, 92 'expected_warnings': ['Log in to extract metadata'], 93 }, { 94 # Without video id in URL 95 'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b', 96 'only_matching': True, 97 }, { 98 'url': 'http://www.vier.be/video/v3/embed/16129', 99 'only_matching': True, 100 }, { 101 'url': 'https://www.vijf.be/embed/video/public/4093', 102 'only_matching': True, 103 }, { 104 'url': 'https://www.vier.be/video/blockbusters/in-juli-en-augustus-summer-classics', 105 'only_matching': True, 106 }, { 107 'url': 'https://www.vier.be/video/achter-de-rug/2017/achter-de-rug-seizoen-1-aflevering-6', 108 'only_matching': True, 109 }] 110 111 def _real_initialize(self): 112 self._logged_in = False 113 114 def _login(self, site): 115 username, password = self._get_login_info() 116 if username is None or password is None: 117 return 118 119 login_page = self._download_webpage( 120 'http://www.%s.be/user/login' % site, 121 None, note='Logging in', errnote='Unable to log in', 122 data=urlencode_postdata({ 123 'form_id': 'user_login', 124 'name': username, 125 'pass': password, 126 }), 127 headers={'Content-Type': 'application/x-www-form-urlencoded'}) 128 129 login_error = self._html_search_regex( 130 r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<', 131 login_page, 'login error', default=None) 132 if login_error: 133 self.report_warning('Unable to log in: %s' % login_error) 134 else: 135 self._logged_in = True 136 137 def _real_extract(self, url): 138 mobj = re.match(self._VALID_URL, url) 139 embed_id = mobj.group('embed_id') 140 display_id = mobj.group('display_id') or embed_id 141 video_id = mobj.group('id') or embed_id 142 site = mobj.group('site') 143 144 if not self._logged_in: 145 self._login(site) 146 147 webpage = self._download_webpage(url, display_id) 148 149 if r'id="user-login"' in webpage: 150 self.report_warning( 151 'Log in to extract metadata', video_id=display_id) 152 webpage = self._download_webpage( 153 'http://www.%s.be/video/v3/embed/%s' % (site, video_id), 154 display_id) 155 156 video_id = self._search_regex( 157 [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'], 158 webpage, 'video id', default=video_id or display_id) 159 160 playlist_url = self._search_regex( 161 r'data-file=(["\'])(?P<url>(?:https?:)?//[^/]+/.+?\.m3u8.*?)\1', 162 webpage, 'm3u8 url', default=None, group='url') 163 164 if not playlist_url: 165 application = self._search_regex( 166 [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], 167 webpage, 'application', default=site + '_vod') 168 filename = self._search_regex( 169 [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], 170 webpage, 'filename') 171 playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename) 172 173 formats = self._extract_wowza_formats( 174 playlist_url, display_id, skip_protocols=['dash']) 175 self._sort_formats(formats) 176 177 title = self._og_search_title(webpage, default=display_id) 178 description = self._html_search_regex( 179 r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-type-text-with-summary\b[^>]*?\1[^>]*>.*?<p>(?P<value>.+?)</p>', 180 webpage, 'description', default=None, group='value') 181 thumbnail = self._og_search_thumbnail(webpage, default=None) 182 upload_date = unified_strdate(self._html_search_regex( 183 r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-name-post-date\b[^>]*?\1[^>]*>.*?(?P<value>\d{2}/\d{2}/\d{4})', 184 webpage, 'upload date', default=None, group='value')) 185 186 series = self._search_regex( 187 r'data-program=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 188 'series', default=None, group='value') 189 episode_number = int_or_none(self._search_regex( 190 r'(?i)aflevering (\d+)', title, 'episode number', default=None)) 191 tags = re.findall(r'<a\b[^>]+\bhref=["\']/tags/[^>]+>([^<]+)<', webpage) 192 193 return { 194 'id': video_id, 195 'display_id': display_id, 196 'title': title, 197 'description': description, 198 'thumbnail': thumbnail, 199 'upload_date': upload_date, 200 'series': series, 201 'episode_number': episode_number, 202 'tags': tags, 203 'formats': formats, 204 } 205 206 207 class VierVideosIE(InfoExtractor): 208 IE_NAME = 'vier:videos' 209 _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)' 210 _TESTS = [{ 211 'url': 'http://www.vier.be/demoestuin/videos', 212 'info_dict': { 213 'id': 'demoestuin', 214 }, 215 'playlist_mincount': 153, 216 }, { 217 'url': 'http://www.vijf.be/temptationisland/videos', 218 'info_dict': { 219 'id': 'temptationisland', 220 }, 221 'playlist_mincount': 159, 222 }, { 223 'url': 'http://www.vier.be/demoestuin/videos?page=6', 224 'info_dict': { 225 'id': 'demoestuin-page6', 226 }, 227 'playlist_mincount': 20, 228 }, { 229 'url': 'http://www.vier.be/demoestuin/videos?page=7', 230 'info_dict': { 231 'id': 'demoestuin-page7', 232 }, 233 'playlist_mincount': 13, 234 }] 235 236 def _real_extract(self, url): 237 mobj = re.match(self._VALID_URL, url) 238 program = mobj.group('program') 239 site = mobj.group('site') 240 241 page_id = mobj.group('page') 242 if page_id: 243 page_id = int(page_id) 244 start_page = page_id 245 playlist_id = '%s-page%d' % (program, page_id) 246 else: 247 start_page = 0 248 playlist_id = program 249 250 entries = [] 251 for current_page_id in itertools.count(start_page): 252 current_page = self._download_webpage( 253 'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id), 254 program, 255 'Downloading page %d' % (current_page_id + 1)) 256 page_entries = [ 257 self.url_result('http://www.' + site + '.be' + video_url, 'Vier') 258 for video_url in re.findall( 259 r'<h[23]><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)] 260 entries.extend(page_entries) 261 if page_id or '>Meer<' not in current_page: 262 break 263 264 return self.playlist_result(entries, playlist_id)