archiveorg.py (3712B)
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    clean_html,
    extract_attributes,
    unified_strdate,
    unified_timestamp,
)


class ArchiveOrgIE(InfoExtractor):
    """Information extractor for archive.org ``details`` and ``embed`` pages."""

    IE_NAME = 'archive.org'
    IE_DESC = 'archive.org videos'
    _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
        'md5': '8af1d4cf447933ed3c7f4871162602db',
        'info_dict': {
            'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
            'ext': 'ogg',
            'title': '1968 Demo - FJCC Conference Presentation Reel #1',
            'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
            'creator': 'SRI International',
            'release_date': '19681210',
            'uploader': 'SRI International',
            'timestamp': 1268695290,
            'upload_date': '20100315',
        }
    }, {
        'url': 'https://archive.org/details/Cops1922',
        'md5': '0869000b4ce265e8ca62738b336b268a',
        'info_dict': {
            'id': 'Cops1922',
            'ext': 'mp4',
            'title': 'Buster Keaton\'s "Cops" (1922)',
            'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c',
            'timestamp': 1387699629,
            'upload_date': '20131222',
        }
    }, {
        'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
        'only_matching': True,
    }, {
        'url': 'https://archive.org/details/MSNBCW_20131125_040000_To_Catch_a_Predator/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the archive.org item addressed by ``url``.

        The embed page is downloaded and searched for a playlist in three
        steps: the ``value`` attribute of the ``js-play8-playlist`` element,
        then the legacy ``Play(...)`` jwplayer call, and finally plain HTML5
        media entries. The result is then enriched with fields taken from
        the item's ``output=json`` details document.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            'http://archive.org/embed/' + video_id, video_id)

        playlist = None
        play8 = self._search_regex(
            r'(<[^>]+\bclass=["\']js-play8-playlist[^>]+>)', webpage,
            'playlist', default=None)
        if play8:
            attrs = extract_attributes(play8)
            playlist = attrs.get('value')
        if not playlist:
            # Old jwplayer fallback
            playlist = self._search_regex(
                r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)",
                webpage, 'jwplayer playlist', default='[]')
        jwplayer_playlist = self._parse_json(playlist, video_id, fatal=False)
        if jwplayer_playlist:
            info = self._parse_jwplayer_data(
                {'playlist': jwplayer_playlist}, video_id, base_url=url)
        else:
            # HTML5 media fallback
            info = self._parse_html5_media_entries(url, webpage, video_id)[0]
            info['id'] = video_id

        def get_optional(metadata, field):
            """Return the first value stored under ``field``, or None.

            Values are normally lists; an empty list, a missing key or an
            explicit None all yield None instead of raising, and a scalar
            value is returned whole rather than being indexed.
            """
            value = metadata.get(field)
            if isinstance(value, (list, tuple)):
                return value[0] if value else None
            return value

        metadata = self._download_json(
            'http://archive.org/details/' + video_id, video_id, query={
                'output': 'json',
            })['metadata']
        info.update({
            # Prefer the metadata title; keep the player-derived one otherwise
            'title': get_optional(metadata, 'title') or info.get('title'),
            'description': clean_html(get_optional(metadata, 'description')),
        })
        # Per-item fields are only attached when the result is a single entry
        if info.get('_type') != 'playlist':
            creator = get_optional(metadata, 'creator')
            info.update({
                'creator': creator,
                'release_date': unified_strdate(get_optional(metadata, 'date')),
                'uploader': get_optional(metadata, 'publisher') or creator,
                'timestamp': unified_timestamp(get_optional(metadata, 'publicdate')),
                'language': get_optional(metadata, 'language'),
            })
        return info