facebook.py (30183B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import json 5 import re 6 import socket 7 8 from .common import InfoExtractor 9 from ..compat import ( 10 compat_etree_fromstring, 11 compat_http_client, 12 compat_str, 13 compat_urllib_error, 14 compat_urllib_parse_unquote, 15 compat_urllib_parse_unquote_plus, 16 ) 17 from ..utils import ( 18 clean_html, 19 error_to_compat_str, 20 ExtractorError, 21 float_or_none, 22 get_element_by_id, 23 int_or_none, 24 js_to_json, 25 limit_length, 26 parse_count, 27 qualities, 28 sanitized_Request, 29 try_get, 30 urlencode_postdata, 31 urljoin, 32 ) 33 34 35 class FacebookIE(InfoExtractor): 36 _VALID_URL = r'''(?x) 37 (?: 38 https?:// 39 (?:[\w-]+\.)?(?:facebook\.com|facebookcorewwwi\.onion)/ 40 (?:[^#]*?\#!/)? 41 (?: 42 (?: 43 video/video\.php| 44 photo\.php| 45 video\.php| 46 video/embed| 47 story\.php| 48 watch(?:/live)?/? 49 )\?(?:.*?)(?:v|video_id|story_fbid)=| 50 [^/]+/videos/(?:[^/]+/)?| 51 [^/]+/posts/| 52 groups/[^/]+/permalink/| 53 watchparty/ 54 )| 55 facebook: 56 ) 57 (?P<id>[0-9]+) 58 ''' 59 _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' 60 _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' 61 _NETRC_MACHINE = 'facebook' 62 IE_NAME = 'facebook' 63 64 _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' 65 _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' 66 67 _TESTS = [{ 68 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', 69 'md5': '6a40d33c0eccbb1af76cf0485a052659', 70 'info_dict': { 71 'id': '637842556329505', 72 'ext': 'mp4', 73 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', 74 'uploader': 'Tennis on Facebook', 75 'upload_date': '20140908', 76 'timestamp': 1410199200, 77 }, 78 'skip': 'Requires logging in', 79 }, { 80 # data.video 81 'url': 'https://www.facebook.com/video.php?v=274175099429670', 82 'info_dict': { 83 'id': '274175099429670', 84 'ext': 'mp4', 85 'title': 're:^Asif Nawab Butt posted a video', 86 'uploader': 'Asif Nawab Butt', 87 'upload_date': '20140506', 88 'timestamp': 1399398998, 89 'thumbnail': r're:^https?://.*', 90 }, 91 'expected_warnings': [ 92 'title' 93 ] 94 }, { 95 'note': 'Video with DASH manifest', 96 'url': 'https://www.facebook.com/video.php?v=957955867617029', 97 'md5': 'b2c28d528273b323abe5c6ab59f0f030', 98 'info_dict': { 99 'id': '957955867617029', 100 'ext': 'mp4', 101 'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...', 102 'uploader': 'Demy de Zeeuw', 103 'upload_date': '20160110', 104 'timestamp': 1452431627, 105 }, 106 'skip': 'Requires logging in', 107 }, { 108 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', 109 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6', 110 'info_dict': { 111 'id': '544765982287235', 112 'ext': 'mp4', 113 'title': '"What are you doing running in the snow?"', 114 'uploader': 'FailArmy', 115 }, 116 'skip': 'Video gone', 117 }, { 118 'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903', 119 'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3', 120 'info_dict': { 121 'id': '1035862816472149', 122 'ext': 'mp4', 123 'title': 'What the Flock Is Going On In New Zealand Credit: ViralHog', 124 'uploader': 'S. Saint', 125 }, 126 'skip': 'Video gone', 127 }, { 128 'note': 'swf params escaped', 129 'url': 'https://www.facebook.com/barackobama/posts/10153664894881749', 130 'md5': '97ba073838964d12c70566e0085c2b91', 131 'info_dict': { 132 'id': '10153664894881749', 133 'ext': 'mp4', 134 'title': 'Average time to confirm recent Supreme Court nominees: 67 days Longest it\'s t...', 135 'thumbnail': r're:^https?://.*', 136 'timestamp': 1456259628, 137 'upload_date': '20160223', 138 'uploader': 'Barack Obama', 139 }, 140 }, { 141 # have 1080P, but only up to 720p in swf params 142 # data.video.story.attachments[].media 143 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', 144 'md5': '9571fae53d4165bbbadb17a94651dcdc', 145 'info_dict': { 146 'id': '10155529876156509', 147 'ext': 'mp4', 148 'title': 'She survived the holocaust — and years later, she’s getting her citizenship s...', 149 'timestamp': 1477818095, 150 'upload_date': '20161030', 151 'uploader': 'CNN', 152 'thumbnail': r're:^https?://.*', 153 'view_count': int, 154 }, 155 }, { 156 # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall 157 # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 158 'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/', 159 'info_dict': { 160 'id': '1417995061575415', 161 'ext': 'mp4', 162 'title': 'md5:1db063d6a8c13faa8da727817339c857', 163 'timestamp': 1486648217, 164 'upload_date': '20170209', 165 'uploader': 'Yaroslav Korpan', 166 }, 167 'params': { 168 'skip_download': True, 169 }, 170 }, { 171 'url': 'https://www.facebook.com/LaGuiaDelVaron/posts/1072691702860471', 172 'info_dict': { 173 'id': '1072691702860471', 174 'ext': 'mp4', 175 'title': 'md5:ae2d22a93fbb12dad20dc393a869739d', 176 'timestamp': 1477305000, 177 'upload_date': '20161024', 178 'uploader': 'La Guía Del Varón', 179 'thumbnail': r're:^https?://.*', 180 }, 181 'params': { 182 'skip_download': True, 183 }, 184 }, { 185 # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 186 'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/', 187 'info_dict': { 188 'id': '1396382447100162', 189 'ext': 'mp4', 190 'title': 'md5:19a428bbde91364e3de815383b54a235', 191 'timestamp': 1486035494, 192 'upload_date': '20170202', 193 'uploader': 'Elisabeth Ahtn', 194 }, 195 'params': { 196 'skip_download': True, 197 }, 198 }, { 199 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 200 'only_matching': True, 201 }, { 202 'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf', 203 'only_matching': True, 204 }, { 205 # data.mediaset.currMedia.edges 206 'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater', 207 'only_matching': True, 208 }, { 209 # data.video.story.attachments[].media 210 'url': 'facebook:544765982287235', 211 'only_matching': True, 212 }, { 213 # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 214 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', 215 'only_matching': True, 216 }, { 217 # data.video.creation_story.attachments[].media 218 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', 219 'only_matching': True, 220 }, { 221 # data.video 222 'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670', 223 'only_matching': True, 224 }, { 225 # no title 226 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', 227 'only_matching': True, 228 }, { 229 # data.video 230 'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/', 231 'info_dict': { 232 'id': '359649331226507', 233 'ext': 'mp4', 234 'title': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses', 235 'uploader': 'ESL One Dota 2', 236 }, 237 'params': { 238 'skip_download': True, 239 }, 240 }, { 241 # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media 242 'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/', 243 'info_dict': { 244 'id': '106560053808006', 245 }, 246 'playlist_count': 2, 247 }, { 248 # data.video.story.attachments[].media 249 'url': 'https://www.facebook.com/watch/?v=647537299265662', 250 'only_matching': True, 251 }, { 252 # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media 253 'url': 'https://www.facebook.com/PankajShahLondon/posts/10157667649866271', 254 'info_dict': { 255 'id': '10157667649866271', 256 }, 257 'playlist_count': 3, 258 }, { 259 # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media 260 'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330', 261 'info_dict': { 262 'id': '117576630041613', 263 'ext': 'mp4', 264 # TODO: title can be extracted from video page 265 'title': 'Facebook video #117576630041613', 266 'uploader_id': '189393014416438', 267 'upload_date': '20201123', 268 'timestamp': 1606162592, 269 }, 270 'skip': 'Requires logging in', 271 }, { 272 # node.comet_sections.content.story.attached_story.attachments.style_type_renderer.attachment.media 273 'url': 'https://www.facebook.com/groups/ateistiskselskab/permalink/10154930137678856/', 274 'info_dict': { 275 'id': '211567722618337', 276 'ext': 'mp4', 277 'title': 'Facebook video #211567722618337', 278 'uploader_id': '127875227654254', 279 'upload_date': '20161122', 280 'timestamp': 1479793574, 281 }, 282 }, { 283 # data.video.creation_story.attachments[].media 284 'url': 'https://www.facebook.com/watch/live/?v=1823658634322275', 285 'only_matching': True, 286 }, { 287 'url': 'https://www.facebook.com/watchparty/211641140192478', 288 'info_dict': { 289 'id': '211641140192478', 290 }, 291 'playlist_count': 1, 292 'skip': 'Requires logging in', 293 }] 294 _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)' 295 _api_config = { 296 'graphURI': '/api/graphql/' 297 } 298 299 @staticmethod 300 def _extract_urls(webpage): 301 urls = [] 302 for mobj in re.finditer( 303 r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1', 304 webpage): 305 urls.append(mobj.group('url')) 306 # Facebook API embed 307 # see https://developers.facebook.com/docs/plugins/embedded-video-player 308 for mobj in re.finditer(r'''(?x)<div[^>]+ 309 class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ 310 data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage): 311 urls.append(mobj.group('url')) 312 return urls 313 314 def _login(self): 315 useremail, password = self._get_login_info() 316 if useremail is None: 317 return 318 319 login_page_req = sanitized_Request(self._LOGIN_URL) 320 self._set_cookie('facebook.com', 'locale', 'en_US') 321 login_page = self._download_webpage(login_page_req, None, 322 note='Downloading login page', 323 errnote='Unable to download login page') 324 lsd = self._search_regex( 325 r'<input type="hidden" name="lsd" value="([^"]*)"', 326 login_page, 'lsd') 327 lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd') 328 329 login_form = { 330 'email': useremail, 331 'pass': password, 332 'lsd': lsd, 333 'lgnrnd': lgnrnd, 334 'next': 'http://facebook.com/home.php', 335 'default_persistent': '0', 336 'legacy_return': '1', 337 'timezone': '-60', 338 'trynum': '1', 339 } 340 request = sanitized_Request(self._LOGIN_URL, urlencode_postdata(login_form)) 341 request.add_header('Content-Type', 'application/x-www-form-urlencoded') 342 try: 343 login_results = self._download_webpage(request, None, 344 note='Logging in', errnote='unable to fetch login page') 345 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None: 346 error = self._html_search_regex( 347 r'(?s)<div[^>]+class=(["\']).*?login_error_box.*?\1[^>]*><div[^>]*>.*?</div><div[^>]*>(?P<error>.+?)</div>', 348 login_results, 'login error', default=None, group='error') 349 if error: 350 raise ExtractorError('Unable to login: %s' % error, expected=True) 351 self._downloader.report_warning('unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.') 352 return 353 354 fb_dtsg = self._search_regex( 355 r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg', default=None) 356 h = self._search_regex( 357 r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h', default=None) 358 359 if not fb_dtsg or not h: 360 return 361 362 check_form = { 363 'fb_dtsg': fb_dtsg, 364 'h': h, 365 'name_action_selected': 'dont_save', 366 } 367 check_req = sanitized_Request(self._CHECKPOINT_URL, urlencode_postdata(check_form)) 368 check_req.add_header('Content-Type', 'application/x-www-form-urlencoded') 369 check_response = self._download_webpage(check_req, None, 370 note='Confirming login') 371 if re.search(r'id="checkpointSubmitButton"', check_response) is not None: 372 self._downloader.report_warning('Unable to confirm login, you have to login in your browser and authorize the login.') 373 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: 374 self._downloader.report_warning('unable to log in: %s' % error_to_compat_str(err)) 375 return 376 377 def _real_initialize(self): 378 self._login() 379 380 def _extract_from_url(self, url, video_id): 381 webpage = self._download_webpage( 382 url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id) 383 384 video_data = None 385 386 def extract_video_data(instances): 387 video_data = [] 388 for item in instances: 389 if try_get(item, lambda x: x[1][0]) == 'VideoConfig': 390 video_item = item[2][0] 391 if video_item.get('video_id'): 392 video_data.append(video_item['videoData']) 393 return video_data 394 395 server_js_data = self._parse_json(self._search_regex( 396 [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'], 397 webpage, 'server js data', default='{}'), video_id, fatal=False) 398 399 if server_js_data: 400 video_data = extract_video_data(server_js_data.get('instances', [])) 401 402 def extract_from_jsmods_instances(js_data): 403 if js_data: 404 return extract_video_data(try_get( 405 js_data, lambda x: x['jsmods']['instances'], list) or []) 406 407 def extract_dash_manifest(video, formats): 408 dash_manifest = video.get('dash_manifest') 409 if dash_manifest: 410 formats.extend(self._parse_mpd_formats( 411 compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)))) 412 413 def process_formats(formats): 414 # Downloads with browser's User-Agent are rate limited. Working around 415 # with non-browser User-Agent. 416 for f in formats: 417 f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' 418 419 self._sort_formats(formats) 420 421 def extract_relay_data(_filter): 422 return self._parse_json(self._search_regex( 423 r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter, 424 webpage, 'replay data', default='{}'), video_id, fatal=False) or {} 425 426 def extract_relay_prefetched_data(_filter): 427 replay_data = extract_relay_data(_filter) 428 for require in (replay_data.get('require') or []): 429 if require[0] == 'RelayPrefetchedStreamCache': 430 return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} 431 432 if not video_data: 433 server_js_data = self._parse_json(self._search_regex([ 434 r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX, 435 r'bigPipe\.onPageletArrive\(({.*?id\s*:\s*"%s".*?})\);' % self._SUPPORTED_PAGLETS_REGEX 436 ], webpage, 'js data', default='{}'), video_id, js_to_json, False) 437 video_data = extract_from_jsmods_instances(server_js_data) 438 439 if not video_data: 440 data = extract_relay_prefetched_data( 441 r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"') 442 if data: 443 entries = [] 444 445 def parse_graphql_video(video): 446 formats = [] 447 q = qualities(['sd', 'hd']) 448 for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]: 449 playable_url = video.get('playable_url' + suffix) 450 if not playable_url: 451 continue 452 formats.append({ 453 'format_id': format_id, 454 'quality': q(format_id), 455 'url': playable_url, 456 }) 457 extract_dash_manifest(video, formats) 458 process_formats(formats) 459 v_id = video.get('videoId') or video.get('id') or video_id 460 info = { 461 'id': v_id, 462 'formats': formats, 463 'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']), 464 'uploader_id': try_get(video, lambda x: x['owner']['id']), 465 'timestamp': int_or_none(video.get('publish_time')), 466 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), 467 } 468 description = try_get(video, lambda x: x['savable_description']['text']) 469 title = video.get('name') 470 if title: 471 info.update({ 472 'title': title, 473 'description': description, 474 }) 475 else: 476 info['title'] = description or 'Facebook video #%s' % v_id 477 entries.append(info) 478 479 def parse_attachment(attachment, key='media'): 480 media = attachment.get(key) or {} 481 if media.get('__typename') == 'Video': 482 return parse_graphql_video(media) 483 484 nodes = data.get('nodes') or [] 485 node = data.get('node') or {} 486 if not nodes and node: 487 nodes.append(node) 488 for node in nodes: 489 story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {} 490 attachments = try_get(story, [ 491 lambda x: x['attached_story']['attachments'], 492 lambda x: x['attachments'] 493 ], list) or [] 494 for attachment in attachments: 495 attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) 496 ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] 497 for n in ns: 498 parse_attachment(n) 499 parse_attachment(attachment) 500 501 edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] 502 for edge in edges: 503 parse_attachment(edge, key='node') 504 505 video = data.get('video') or {} 506 if video: 507 attachments = try_get(video, [ 508 lambda x: x['story']['attachments'], 509 lambda x: x['creation_story']['attachments'] 510 ], list) or [] 511 for attachment in attachments: 512 parse_attachment(attachment) 513 if not entries: 514 parse_graphql_video(video) 515 516 return self.playlist_result(entries, video_id) 517 518 if not video_data: 519 m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage) 520 if m_msg is not None: 521 raise ExtractorError( 522 'The video is not available, Facebook said: "%s"' % m_msg.group(1), 523 expected=True) 524 elif any(p in webpage for p in ( 525 '>You must log in to continue', 526 'id="login_form"', 527 'id="loginbutton"')): 528 self.raise_login_required() 529 530 if not video_data and '/watchparty/' in url: 531 post_data = { 532 'doc_id': 3731964053542869, 533 'variables': json.dumps({ 534 'livingRoomID': video_id, 535 }), 536 } 537 538 prefetched_data = extract_relay_prefetched_data(r'"login_data"\s*:\s*{') 539 if prefetched_data: 540 lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict) 541 if lsd: 542 post_data[lsd['name']] = lsd['value'] 543 544 relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,') 545 for define in (relay_data.get('define') or []): 546 if define[0] == 'RelayAPIConfigDefaults': 547 self._api_config = define[2] 548 549 living_room = self._download_json( 550 urljoin(url, self._api_config['graphURI']), video_id, 551 data=urlencode_postdata(post_data))['data']['living_room'] 552 553 entries = [] 554 for edge in (try_get(living_room, lambda x: x['recap']['watched_content']['edges']) or []): 555 video = try_get(edge, lambda x: x['node']['video']) or {} 556 v_id = video.get('id') 557 if not v_id: 558 continue 559 v_id = compat_str(v_id) 560 entries.append(self.url_result( 561 self._VIDEO_PAGE_TEMPLATE % v_id, 562 self.ie_key(), v_id, video.get('name'))) 563 564 return self.playlist_result(entries, video_id) 565 566 if not video_data: 567 # Video info not in first request, do a secondary request using 568 # tahoe player specific URL 569 tahoe_data = self._download_webpage( 570 self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, 571 data=urlencode_postdata({ 572 '__a': 1, 573 '__pc': self._search_regex( 574 r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage, 575 'pkg cohort', default='PHASED:DEFAULT'), 576 '__rev': self._search_regex( 577 r'client_revision["\']\s*:\s*(\d+),', webpage, 578 'client revision', default='3944515'), 579 'fb_dtsg': self._search_regex( 580 r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"', 581 webpage, 'dtsg token', default=''), 582 }), 583 headers={ 584 'Content-Type': 'application/x-www-form-urlencoded', 585 }) 586 tahoe_js_data = self._parse_json( 587 self._search_regex( 588 r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data, 589 'tahoe js data', default='{}'), 590 video_id, fatal=False) 591 video_data = extract_from_jsmods_instances(tahoe_js_data) 592 593 if not video_data: 594 raise ExtractorError('Cannot parse data') 595 596 if len(video_data) > 1: 597 entries = [] 598 for v in video_data: 599 video_url = v[0].get('video_url') 600 if not video_url: 601 continue 602 entries.append(self.url_result(urljoin( 603 url, video_url), self.ie_key(), v[0].get('video_id'))) 604 return self.playlist_result(entries, video_id) 605 video_data = video_data[0] 606 607 formats = [] 608 subtitles = {} 609 for f in video_data: 610 format_id = f['stream_type'] 611 if f and isinstance(f, dict): 612 f = [f] 613 if not f or not isinstance(f, list): 614 continue 615 for quality in ('sd', 'hd'): 616 for src_type in ('src', 'src_no_ratelimit'): 617 src = f[0].get('%s_%s' % (quality, src_type)) 618 if src: 619 preference = -10 if format_id == 'progressive' else 0 620 if quality == 'hd': 621 preference += 5 622 formats.append({ 623 'format_id': '%s_%s_%s' % (format_id, quality, src_type), 624 'url': src, 625 'preference': preference, 626 }) 627 extract_dash_manifest(f[0], formats) 628 subtitles_src = f[0].get('subtitles_src') 629 if subtitles_src: 630 subtitles.setdefault('en', []).append({'url': subtitles_src}) 631 if not formats: 632 raise ExtractorError('Cannot find video formats') 633 634 process_formats(formats) 635 636 video_title = self._html_search_regex( 637 r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 638 'title', default=None) 639 if not video_title: 640 video_title = self._html_search_regex( 641 r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>', 642 webpage, 'alternative title', default=None) 643 if not video_title: 644 video_title = self._html_search_meta( 645 'description', webpage, 'title', default=None) 646 if video_title: 647 video_title = limit_length(video_title, 80) 648 else: 649 video_title = 'Facebook video #%s' % video_id 650 uploader = clean_html(get_element_by_id( 651 'fbPhotoPageAuthorName', webpage)) or self._search_regex( 652 r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', 653 default=None) or self._og_search_title(webpage, fatal=False) 654 timestamp = int_or_none(self._search_regex( 655 r'<abbr[^>]+data-utime=["\'](\d+)', webpage, 656 'timestamp', default=None)) 657 thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) 658 659 view_count = parse_count(self._search_regex( 660 r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', 661 default=None)) 662 663 info_dict = { 664 'id': video_id, 665 'title': video_title, 666 'formats': formats, 667 'uploader': uploader, 668 'timestamp': timestamp, 669 'thumbnail': thumbnail, 670 'view_count': view_count, 671 'subtitles': subtitles, 672 } 673 674 return info_dict 675 676 def _real_extract(self, url): 677 video_id = self._match_id(url) 678 679 real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url 680 return self._extract_from_url(real_url, video_id) 681 682 683 class FacebookPluginsVideoIE(InfoExtractor): 684 _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/plugins/video\.php\?.*?\bhref=(?P<id>https.+)' 685 686 _TESTS = [{ 687 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fgov.sg%2Fvideos%2F10154383743583686%2F&show_text=0&width=560', 688 'md5': '5954e92cdfe51fe5782ae9bda7058a07', 689 'info_dict': { 690 'id': '10154383743583686', 691 'ext': 'mp4', 692 'title': 'What to do during the haze?', 693 'uploader': 'Gov.sg', 694 'upload_date': '20160826', 695 'timestamp': 1472184808, 696 }, 697 'add_ie': [FacebookIE.ie_key()], 698 }, { 699 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fvideo.php%3Fv%3D10204634152394104', 700 'only_matching': True, 701 }, { 702 'url': 'https://www.facebook.com/plugins/video.php?href=https://www.facebook.com/gov.sg/videos/10154383743583686/&show_text=0&width=560', 703 'only_matching': True, 704 }] 705 706 def _real_extract(self, url): 707 return self.url_result( 708 compat_urllib_parse_unquote(self._match_id(url)), 709 FacebookIE.ie_key())