From: Sergey M․
Date: Sun, 26 Oct 2014 09:48:11 +0000 (+0700)
Subject: [vrt] Improve extractor
X-Git-Url: http://git.oshgnacknak.de/?a=commitdiff_plain;h=911344e5aca14d207731148f4862913827c8c871;p=youtube-dl

[vrt] Improve extractor
---

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index ec299b72f..5ec9b4745 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -27,7 +27,6 @@ from .bandcamp import BandcampIE, BandcampAlbumIE
 from .bbccouk import BBCCoUkIE
 from .beeg import BeegIE
 from .behindkink import BehindKinkIE
-from .belgiannational import BelgianNationalIE
 from .bild import BildIE
 from .bilibili import BiliBiliIE
 from .blinkx import BlinkxIE
@@ -446,6 +445,7 @@ from .viki import VikiIE
 from .vk import VKIE
 from .vodlocker import VodlockerIE
 from .vporn import VpornIE
+from .vrt import VRTIE
 from .vube import VubeIE
 from .vuclip import VuClipIE
 from .vulture import VultureIE
diff --git a/youtube_dl/extractor/belgiannational.py b/youtube_dl/extractor/belgiannational.py
deleted file mode 100644
index cabff50f2..000000000
--- a/youtube_dl/extractor/belgiannational.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import int_or_none
-
-class BelgianNationalIE(InfoExtractor):
-    _VALID_URL = r'http://(?:deredactie|sporza|cobra)\.be/cm/(.*)/(?P<video_id>[^\']+)'
-    _TESTS = [
-        # deredactie.be
-        {
-            'url': 'http://deredactie.be/cm/vrtnieuws/videozone/programmas/journaal/EP_141025_JOL',
-            'md5': '4cebde1eb60a53782d4f3992cbd46ec8',
-            'info_dict': {
-                'id': 'EP_141025_JOL',
-                'title': 'Het journaal L - 25/10/14',
-                'ext': 'mp4',
-                'duration': 929,
-            }
-        },
-        # sporza.be
-        {
-            'url': 'http://sporza.be/cm/sporza/videozone/programmas/extratime/EP_141020_Extra_time',
-            'md5': '11f53088da9bf8e7cfc42456697953ff',
-            'info_dict': {
-                'id': 'EP_141020_Extra_time',
-                'title': 'Bekijk Extra Time van 20 oktober',
-                'ext': 'mp4',
-                'duration': 3238,
-            }
-        },
-        # cobra.be
-        {
-            'url': 'http://cobra.be/cm/cobra/videozone/rubriek/film-videozone/141022-mv-ellis-cafecorsari',
-            'md5': '78a2b060a5083c4f055449a72477409d',
-            'info_dict': {
-                'id': '141022-mv-ellis-cafecorsari',
-                'title': 'Bret Easton Ellis in Café Corsari',
-                'ext': 'mp4',
-                'duration': 661,
-            }
-        },
-    ]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('video_id')
-
-        webpage = self._download_webpage(url, video_id)
-        title = self._og_search_title(webpage)
-
-        video_url = self._search_regex(r'data-video-src="(.*?)"', webpage, 'Video url') + '/manifest.f4m'
-        duration = int_or_none(self._search_regex(r'data-video-sitestat-duration="(.*?)"', webpage, 'Duration'))
-
-        return {
-            'id': video_id,
-            'title': title,
-            'url': video_url,
-            'ext': 'mp4',
-            'duration': duration,
-        }
\ No newline at end of file
diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py
new file mode 100644
index 000000000..57ef8dc30
--- /dev/null
+++ b/youtube_dl/extractor/vrt.py
@@ -0,0 +1,95 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import float_or_none
+
+
+class VRTIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:deredactie|sporza|cobra)\.be/cm/(?:[^/]+/)+(?P<id>[^/]+)/*'
+    _TESTS = [
+        # deredactie.be
+        {
+            'url': 'http://deredactie.be/cm/vrtnieuws/videozone/programmas/journaal/EP_141025_JOL',
+            'md5': '4cebde1eb60a53782d4f3992cbd46ec8',
+            'info_dict': {
+                'id': '2129880',
+                'ext': 'flv',
+                'title': 'Het journaal L - 25/10/14',
+                'description': None,
+                'timestamp': 1414271750.949,
+                'upload_date': '20141025',
+                'duration': 929,
+            }
+        },
+        # sporza.be
+        {
+            'url': 'http://sporza.be/cm/sporza/videozone/programmas/extratime/EP_141020_Extra_time',
+            'md5': '11f53088da9bf8e7cfc42456697953ff',
+            'info_dict': {
+                'id': '2124639',
+                'ext': 'flv',
+                'title': 'Bekijk Extra Time van 20 oktober',
+                'description': 'md5:83ac5415a4f1816c6a93f8138aef2426',
+                'timestamp': 1413835980.560,
+                'upload_date': '20141020',
+                'duration': 3238,
+            }
+        },
+        # cobra.be
+        {
+            'url': 'http://cobra.be/cm/cobra/videozone/rubriek/film-videozone/141022-mv-ellis-cafecorsari',
+            'md5': '78a2b060a5083c4f055449a72477409d',
+            'info_dict': {
+                'id': '2126050',
+                'ext': 'flv',
+                'title': 'Bret Easton Ellis in Café Corsari',
+                'description': 'md5:f699986e823f32fd6036c1855a724ee9',
+                'timestamp': 1413967500.494,
+                'upload_date': '20141022',
+                'duration': 661,
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_id = self._search_regex(
+            r'data-video-id="([^"]+)_[^"]+"', webpage, 'video id', fatal=False)
+
+        formats = []
+        mobj = re.search(
+            r'data-video-iphone-server="(?P<server>[^"]+)"\s+data-video-iphone-path="(?P<path>[^"]+)"',
+            webpage)
+        if mobj:
+            formats.extend(self._extract_m3u8_formats(
+                '%s/%s' % (mobj.group('server'), mobj.group('path')),
+                video_id, 'mp4'))
+        mobj = re.search(r'data-video-src="(?P<src>[^"]+)"', webpage)
+        if mobj:
+            formats.extend(self._extract_f4m_formats(
+                '%s/manifest.f4m' % mobj.group('src'), video_id))
+        self._sort_formats(formats)
+
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage, default=None)
+        thumbnail = self._og_search_thumbnail(webpage)
+        timestamp = float_or_none(self._search_regex(
+            r'data-video-sitestat-pubdate="(\d+)"', webpage, 'timestamp', fatal=False), 1000)
+        duration = float_or_none(self._search_regex(
+            r'data-video-duration="(\d+)"', webpage, 'duration', fatal=False), 1000)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+        }
\ No newline at end of file