airmozilla.py (2697B)
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    parse_duration,
    parse_iso8601,
)


class AirMozillaIE(InfoExtractor):
    """Extractor for event pages on air.mozilla.org.

    The event page embeds a vid.ly player; the media information is read
    from the JW Player configuration found in the vid.ly embed script,
    and the remaining metadata is scraped from the event page itself.
    """

    _VALID_URL = r'https?://air\.mozilla\.org/(?P<id>[0-9a-z-]+)/?'
    _TEST = {
        'url': 'https://air.mozilla.org/privacy-lab-a-meetup-for-privacy-minded-people-in-san-francisco/',
        'md5': '8d02f53ee39cf006009180e21df1f3ba',
        'info_dict': {
            'id': '6x4q2w',
            'ext': 'mp4',
            'title': 'Privacy Lab - a meetup for privacy minded people in San Francisco',
            'thumbnail': r're:https?://.*/poster\.jpg',
            'description': 'Brings together privacy professionals and others interested in privacy at for-profits, non-profits, and NGOs in an effort to contribute to the state of the ecosystem...',
            'timestamp': 1422487800,
            'upload_date': '20150128',
            'location': 'SFO Commons',
            'duration': 3780,
            'view_count': int,
            'categories': ['Main', 'Privacy'],
        }
    }

    def _real_extract(self, url):
        """Build the info dict for a single Air Mozilla event page.

        Downloads the event page, locates the embedded vid.ly id,
        downloads the vid.ly embed script and parses the JW Player
        configuration passed to ``initCallback(...)``. The resulting
        dict is then updated with metadata scraped from the event page.

        :param url: event page URL matching ``_VALID_URL``.
        :return: info dict produced by ``_parse_jwplayer_data`` and
            updated with id, title, url, display_id, description,
            timestamp, location, duration, view_count and categories.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        video_id = self._html_search_regex(r'//vid\.ly/(.*?)/embed', webpage, 'id')

        embed_script = self._download_webpage('https://vid.ly/{0}/embed'.format(video_id), video_id)
        jwconfig = self._parse_json(self._search_regex(
            r'initCallback\((.*)\);', embed_script, 'metadata'), video_id)['config']

        info_dict = self._parse_jwplayer_data(jwconfig, video_id)

        # The lookups below are non-fatal: a missing field yields None
        # instead of aborting the extraction.
        view_count = int_or_none(self._html_search_regex(
            r'Views since archived: ([0-9]+)',
            webpage, 'view count', fatal=False))
        timestamp = parse_iso8601(self._html_search_regex(
            r'<time datetime="(.*?)"', webpage, 'timestamp', fatal=False))
        # Accept "H hours M minutes", "M minutes" alone or "H hours" alone.
        # Requiring both components would drop the duration of events that
        # are shorter than one hour or last a whole number of hours.
        duration = parse_duration(self._search_regex(
            r'Duration:\s*((?:\d+\s*hours?\s*)?\d+\s*minutes?|\d+\s*hours?)',
            webpage, 'duration', fatal=False))

        info_dict.update({
            'id': video_id,
            'title': self._og_search_title(webpage),
            'url': self._og_search_url(webpage),
            'display_id': display_id,
            'description': self._og_search_description(webpage),
            'timestamp': timestamp,
            'location': self._html_search_regex(r'Location: (.*)', webpage, 'location', default=None),
            'duration': duration,
            'view_count': view_count,
            # One entry per channel link found on the page.
            'categories': re.findall(r'<a href=".*?" class="channel">(.*?)</a>', webpage),
        })

        return info_dict