tvp.py (9417B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import itertools 5 import re 6 7 from .common import InfoExtractor 8 from ..utils import ( 9 clean_html, 10 determine_ext, 11 ExtractorError, 12 get_element_by_attribute, 13 orderedSet, 14 ) 15 16 17 class TVPIE(InfoExtractor): 18 IE_NAME = 'tvp' 19 IE_DESC = 'Telewizja Polska' 20 _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)' 21 22 _TESTS = [{ 23 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536', 24 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 25 'info_dict': { 26 'id': '194536', 27 'ext': 'mp4', 28 'title': 'Czas honoru, odc. 13 – Władek', 29 'description': 'md5:437f48b93558370b031740546b696e24', 30 }, 31 }, { 32 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', 33 'md5': 'b0005b542e5b4de643a9690326ab1257', 34 'info_dict': { 35 'id': '17916176', 36 'ext': 'mp4', 37 'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', 38 'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', 39 }, 40 }, { 41 # page id is not the same as video id(#7799) 42 'url': 'https://wiadomosci.tvp.pl/33908820/28092017-1930', 43 'md5': '84cd3c8aec4840046e5ab712416b73d0', 44 'info_dict': { 45 'id': '33908820', 46 'ext': 'mp4', 47 'title': 'Wiadomości, 28.09.2017, 19:30', 48 'description': 'Wydanie główne codziennego serwisu informacyjnego.' 49 }, 50 'skip': 'HTTP Error 404: Not Found', 51 }, { 52 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', 53 'only_matching': True, 54 }, { 55 'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200', 56 'only_matching': True, 57 }, { 58 'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa', 59 'only_matching': True, 60 }, { 61 'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach', 62 'only_matching': True, 63 }, { 64 'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum', 65 'only_matching': True, 66 }, { 67 'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji', 68 'only_matching': True, 69 }] 70 71 def _real_extract(self, url): 72 page_id = self._match_id(url) 73 webpage = self._download_webpage(url, page_id) 74 video_id = self._search_regex([ 75 r'<iframe[^>]+src="[^"]*?object_id=(\d+)', 76 r"object_id\s*:\s*'(\d+)'", 77 r'data-video-id="(\d+)"'], webpage, 'video id', default=page_id) 78 return { 79 '_type': 'url_transparent', 80 'url': 'tvp:' + video_id, 81 'description': self._og_search_description( 82 webpage, default=None) or self._html_search_meta( 83 'description', webpage, default=None), 84 'thumbnail': self._og_search_thumbnail(webpage, default=None), 85 'ie_key': 'TVPEmbed', 86 } 87 88 89 class TVPEmbedIE(InfoExtractor): 90 IE_NAME = 'tvp:embed' 91 IE_DESC = 'Telewizja Polska' 92 _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)' 93 94 _TESTS = [{ 95 'url': 'tvp:194536', 96 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 97 'info_dict': { 98 'id': '194536', 99 'ext': 'mp4', 100 'title': 'Czas honoru, odc. 13 – Władek', 101 }, 102 }, { 103 # not available 104 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', 105 'md5': '8c9cd59d16edabf39331f93bf8a766c7', 106 'info_dict': { 107 'id': '22670268', 108 'ext': 'mp4', 109 'title': 'Panorama, 07.12.2015, 15:40', 110 }, 111 'skip': 'Transmisja została zakończona lub materiał niedostępny', 112 }, { 113 'url': 'tvp:22670268', 114 'only_matching': True, 115 }] 116 117 def _real_extract(self, url): 118 video_id = self._match_id(url) 119 120 webpage = self._download_webpage( 121 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) 122 123 error = self._html_search_regex( 124 r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>', 125 webpage, 'error', default=None) or clean_html( 126 get_element_by_attribute('class', 'msg error', webpage)) 127 if error: 128 raise ExtractorError('%s said: %s' % ( 129 self.IE_NAME, clean_html(error)), expected=True) 130 131 title = self._search_regex( 132 r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1', 133 webpage, 'title', group='title') 134 series_title = self._search_regex( 135 r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1', 136 webpage, 'series', group='series', default=None) 137 if series_title: 138 title = '%s, %s' % (series_title, title) 139 140 thumbnail = self._search_regex( 141 r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None) 142 143 video_url = self._search_regex( 144 r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 145 'formats', group='url', default=None) 146 if not video_url or 'material_niedostepny.mp4' in video_url: 147 video_url = self._download_json( 148 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id, 149 video_id)['video_url'] 150 151 formats = [] 152 video_url_base = self._search_regex( 153 r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', 154 video_url, 'video base url', default=None) 155 if video_url_base: 156 # TODO: <Group> found instead of <AdaptationSet> in MPD manifest. 157 # It's not mentioned in MPEG-DASH standard. Figure that out. 158 # formats.extend(self._extract_mpd_formats( 159 # video_url_base + '.ism/video.mpd', 160 # video_id, mpd_id='dash', fatal=False)) 161 formats.extend(self._extract_ism_formats( 162 video_url_base + '.ism/Manifest', 163 video_id, 'mss', fatal=False)) 164 formats.extend(self._extract_f4m_formats( 165 video_url_base + '.ism/video.f4m', 166 video_id, f4m_id='hds', fatal=False)) 167 m3u8_formats = self._extract_m3u8_formats( 168 video_url_base + '.ism/video.m3u8', video_id, 169 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) 170 self._sort_formats(m3u8_formats) 171 m3u8_formats = list(filter( 172 lambda f: f.get('vcodec') != 'none', m3u8_formats)) 173 formats.extend(m3u8_formats) 174 for i, m3u8_format in enumerate(m3u8_formats, 2): 175 http_url = '%s-%d.mp4' % (video_url_base, i) 176 if self._is_valid_url(http_url, video_id): 177 f = m3u8_format.copy() 178 f.update({ 179 'url': http_url, 180 'format_id': f['format_id'].replace('hls', 'http'), 181 'protocol': 'http', 182 }) 183 formats.append(f) 184 else: 185 formats = [{ 186 'format_id': 'direct', 187 'url': video_url, 188 'ext': determine_ext(video_url, 'mp4'), 189 }] 190 191 self._sort_formats(formats) 192 193 return { 194 'id': video_id, 195 'title': title, 196 'thumbnail': thumbnail, 197 'formats': formats, 198 } 199 200 201 class TVPWebsiteIE(InfoExtractor): 202 IE_NAME = 'tvp:series' 203 _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)' 204 205 _TESTS = [{ 206 # series 207 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video', 208 'info_dict': { 209 'id': '38678312', 210 }, 211 'playlist_count': 115, 212 }, { 213 # film 214 'url': 'https://vod.tvp.pl/website/gloria,35139666', 215 'info_dict': { 216 'id': '36637049', 217 'ext': 'mp4', 218 'title': 'Gloria, Gloria', 219 }, 220 'params': { 221 'skip_download': True, 222 }, 223 'add_ie': ['TVPEmbed'], 224 }, { 225 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312', 226 'only_matching': True, 227 }] 228 229 def _entries(self, display_id, playlist_id): 230 url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id) 231 for page_num in itertools.count(1): 232 page = self._download_webpage( 233 url, display_id, 'Downloading page %d' % page_num, 234 query={'page': page_num}) 235 236 video_ids = orderedSet(re.findall( 237 r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % display_id, 238 page)) 239 240 if not video_ids: 241 break 242 243 for video_id in video_ids: 244 yield self.url_result( 245 'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(), 246 video_id=video_id) 247 248 def _real_extract(self, url): 249 mobj = re.match(self._VALID_URL, url) 250 display_id, playlist_id = mobj.group('display_id', 'id') 251 return self.playlist_result( 252 self._entries(display_id, playlist_id), playlist_id)