cracked.py (3138B)
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from .youtube import YoutubeIE
from ..utils import (
    parse_iso8601,
    str_to_int,
)


class CrackedIE(InfoExtractor):
    """Information extractor for video pages on cracked.com.

    A page that embeds a YouTube player is delegated to ``YoutubeIE``;
    otherwise the media URL and metadata are scraped from the page markup.
    """

    _VALID_URL = r'https?://(?:www\.)?cracked\.com/video_(?P<id>\d+)_[\da-z-]+\.html'
    _TESTS = [{
        'url': 'http://www.cracked.com/video_19070_if-animal-actors-got-e21-true-hollywood-stories.html',
        'md5': '89b90b9824e3806ca95072c4d78f13f7',
        'info_dict': {
            'id': '19070',
            'ext': 'mp4',
            'title': 'If Animal Actors Got E! True Hollywood Stories',
            'timestamp': 1404954000,
            'upload_date': '20140710',
        }
    }, {
        # youtube embed
        'url': 'http://www.cracked.com/video_19006_4-plot-holes-you-didnt-notice-in-your-favorite-movies.html',
        'md5': 'ccd52866b50bde63a6ef3b35016ba8c7',
        'info_dict': {
            'id': 'EjI00A3rZD0',
            'ext': 'mp4',
            'title': "4 Plot Holes You Didn't Notice in Your Favorite Movies - The Spit Take",
            'description': 'md5:c603708c718b796fe6079e2b3351ffc7',
            'upload_date': '20140725',
            'uploader_id': 'Cracked',
            'uploader': 'Cracked',
        }
    }]

    def _real_extract(self, url):
        """Build the result for a single cracked.com video page.

        Returns either a ``url_result`` pointing at the embedded YouTube
        video, or an info dict with the keys ``id``, ``url``, ``title``,
        ``description``, ``timestamp``, ``view_count``, ``comment_count``,
        ``height`` and ``width``.
        """
        video_id = self._match_id(url)
        page = self._download_webpage(url, video_id)

        # Pages that merely wrap a YouTube player are handed to YoutubeIE.
        embedded = YoutubeIE._extract_url(page)
        if embedded:
            return self.url_result(embedded, ie=YoutubeIE.ie_key())

        # Mandatory fields: these lookups use the default (fatal) behaviour.
        media_url = self._html_search_regex(
            [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'],
            page, 'video URL')
        video_title = self._search_regex(
            [r'property="?og:title"?\s+content="([^"]+)"', r'class="?title"?>([^<]+)'],
            page, 'title')

        # Optional metadata: each lookup is non-fatal or has a default.
        summary = self._search_regex(
            r'name="?(?:og:)?description"?\s+content="([^"]+)"',
            page, 'description', default=None)

        raw_date = self._html_search_regex(
            r'"date"\s*:\s*"([^"]+)"', page, 'upload date', fatal=False)
        # The last six characters are dropped before parsing
        # (presumably a trailing UTC offset -- TODO confirm).
        uploaded_at = parse_iso8601(raw_date[:-6]) if raw_date else raw_date

        raw_views = self._html_search_regex(
            r'<span\s+class="?views"? id="?viewCounts"?>([\d,\.]+) Views</span>',
            page, 'view count', fatal=False)
        views = str_to_int(raw_views)

        raw_comments = self._html_search_regex(
            r'<span\s+id="?commentCounts"?>([\d,\.]+)</span>',
            page, 'comment count', fatal=False)
        comments = str_to_int(raw_comments)

        # Frame size is taken from a "_<width>X<height>.mp4" URL suffix.
        size = re.search(r'_(?P<width>\d+)X(?P<height>\d+)\.mp4$', media_url)
        frame_width = int(size.group('width')) if size else None
        frame_height = int(size.group('height')) if size else None

        return {
            'id': video_id,
            'url': media_url,
            'title': video_title,
            'description': summary,
            'timestamp': uploaded_at,
            'view_count': views,
            'comment_count': comments,
            'height': frame_height,
            'width': frame_width,
        }