3 from __future__ import unicode_literals
9 from .common import InfoExtractor
10 from .youtube import YoutubeIE
11 from ..compat import (
12 compat_etree_fromstring,
13 compat_urllib_parse_unquote,
15 compat_xml_parse_error,
32 from .brightcove import (
36 from .nbc import NBCSportsVPlayerIE
37 from .ooyala import OoyalaIE
38 from .rutv import RUTVIE
39 from .tvc import TVCIE
40 from .sportbox import SportBoxEmbedIE
41 from .smotri import SmotriIE
42 from .myvi import MyviIE
43 from .condenast import CondeNastIE
44 from .udn import UDNEmbedIE
45 from .senateisvp import SenateISVPIE
46 from .svt import SVTIE
47 from .pornhub import PornHubIE
48 from .xhamster import XHamsterEmbedIE
49 from .tnaflix import TNAFlixNetworkEmbedIE
50 from .drtuber import DrTuberIE
51 from .redtube import RedTubeIE
52 from .vimeo import VimeoIE
53 from .dailymotion import (
57 from .onionstudios import OnionStudiosIE
58 from .viewlift import ViewLiftEmbedIE
59 from .mtv import MTVServicesEmbeddedIE
60 from .pladform import PladformIE
61 from .videomore import VideomoreIE
62 from .webcaster import WebcasterFeedIE
63 from .googledrive import GoogleDriveIE
64 from .jwplatform import JWPlatformIE
65 from .digiteka import DigitekaIE
66 from .arkena import ArkenaIE
67 from .instagram import InstagramIE
68 from .liveleak import LiveLeakIE
69 from .threeqsdn import ThreeQSDNIE
70 from .theplatform import ThePlatformIE
71 from .vessel import VesselIE
72 from .kaltura import KalturaIE
73 from .eagleplatform import EaglePlatformIE
74 from .facebook import FacebookIE
75 from .soundcloud import SoundcloudIE
76 from .tunein import TuneInBaseIE
77 from .vbox7 import Vbox7IE
78 from .dbtv import DBTVIE
79 from .piksel import PikselIE
80 from .videa import VideaIE
81 from .twentymin import TwentyMinutenIE
84 class GenericIE(InfoExtractor):
85 IE_DESC = 'Generic downloader that works on some sites'
89 # Direct link to a video
91 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
92 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
97 'upload_date': '20100513',
100 # Direct link to media delivered compressed (until Accept-Encoding is *)
102 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
103 'md5': '128c42e68b13950268b648275386fc74',
105 'id': 'FictionJunction-Parallel_Hearts',
107 'title': 'FictionJunction-Parallel_Hearts',
108 'upload_date': '20140522',
110 'expected_warnings': [
111 'URL could be a direct video link, returning it as such.'
113 'skip': 'URL invalid',
115 # Direct download with broken HEAD
117 'url': 'http://ai-radio.org:8000/radio.opus',
124 'skip_download': True, # infinite live stream
126 'expected_warnings': [
127 r'501.*Not Implemented',
131 # Direct link with incorrect MIME type
133 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
134 'md5': '4ccbebe5f36706d85221f204d7eb5913',
136 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
137 'id': '5_Lennart_Poettering_-_Systemd',
139 'title': '5_Lennart_Poettering_-_Systemd',
140 'upload_date': '20141120',
142 'expected_warnings': [
143 'URL could be a direct video link, returning it as such.'
148 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
150 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
151 'title': 'Zero Punctuation',
152 'description': 're:.*groundbreaking video review series.*'
154 'playlist_mincount': 11,
156 # RSS feed with enclosure
158 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
160 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
162 'upload_date': '20150228',
163 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
166 # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng
168 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml',
172 'title': 'Automatics, robotics and biocybernetics',
173 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
174 'upload_date': '20130627',
175 'formats': 'mincount:16',
176 'subtitles': 'mincount:1',
179 'force_generic_extractor': True,
180 'skip_download': True,
183 # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html
185 'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil',
190 'formats': 'mincount:1',
193 'skip_download': True,
196 # SMIL from https://www.restudy.dk/video/play/id/1637
198 'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml',
202 'title': 'video_1637',
203 'formats': 'mincount:3',
206 'skip_download': True,
209 # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm
211 'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil',
213 'id': 'smil-service',
215 'title': 'smil-service',
216 'formats': 'mincount:1',
219 'skip_download': True,
222 # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370
224 'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil',
228 'title': '571de1fd-47bc-48db-abf9-238872a58d1f',
229 'formats': 'mincount:3',
232 'skip_download': True,
235 # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html
237 'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf',
239 'id': 'mZlp2ctYIUEB',
241 'title': 'Tikibad ontruimd wegens brand',
242 'description': 'md5:05ca046ff47b931f9b04855015e163a4',
243 'thumbnail': r're:^https?://.*\.jpg$',
247 'skip_download': True,
250 # MPD from http://dash-mse-test.appspot.com/media.html
252 'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd',
253 'md5': '4b57baab2e30d6eb3a6a09f0ba57ef53',
255 'id': 'car-20120827-manifest',
257 'title': 'car-20120827-manifest',
258 'formats': 'mincount:9',
259 'upload_date': '20130904',
262 'format': 'bestvideo',
265 # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8
267 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8',
272 'formats': 'mincount:8',
276 'skip_download': True,
278 'skip': 'video gone',
280 # m3u8 served with Content-Type: text/plain
282 'url': 'http://www.nacentapps.com/m3u8/index.m3u8',
287 'upload_date': '20140720',
288 'formats': 'mincount:11',
292 'skip_download': True,
294 'skip': 'video gone',
298 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
302 'upload_date': '20130224',
303 'uploader_id': 'TheVerge',
304 'description': r're:^Chris Ziegler takes a look at the\.*',
305 'uploader': 'The Verge',
306 'title': 'First Firefox OS phones side-by-side',
309 'skip_download': False,
313 # redirect in Refresh HTTP header
314 'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1',
318 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set',
319 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5',
320 'upload_date': '20150917',
321 'uploader_id': 'brtvofficial',
322 'uploader': 'Boiler Room',
325 'skip_download': False,
329 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
330 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
332 'id': '13601338388002',
334 'uploader': 'www.hodiho.fr',
335 'title': 'R\u00e9gis plante sa Jeep',
338 # bandcamp page with custom domain
340 'add_ie': ['Bandcamp'],
341 'url': 'http://bronyrock.com/track/the-pony-mash',
345 'title': 'The Pony Mash',
346 'uploader': 'M_Pallante',
348 'skip': 'There is a limit of 200 free downloads / month for the test song',
351 # embedded brightcove video
352 # it also tests brightcove videos that need to set the 'Referer'
353 # in the http requests
354 'add_ie': ['BrightcoveLegacy'],
355 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
357 'id': '2765128793001',
359 'title': 'Le cours de bourse : l’analyse technique',
360 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
361 'uploader': 'BFM BUSINESS',
364 'skip_download': True,
368 # embedded with itemprop embedURL and video id spelled as `idVideo`
369 'add_id': ['BrightcoveLegacy'],
370 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/',
372 'id': '5255628253001',
374 'title': 'md5:37c519b1128915607601e75a87995fc0',
375 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26',
376 'uploader': 'BFM BUSINESS',
377 'uploader_id': '876450612001',
378 'timestamp': 1482255315,
379 'upload_date': '20161220',
382 'skip_download': True,
386 # https://github.com/rg3/youtube-dl/issues/2253
387 'url': 'http://bcove.me/i6nfkrc3',
388 'md5': '0ba9446db037002366bab3b3eb30c88c',
390 'id': '3101154703001',
392 'title': 'Still no power',
393 'uploader': 'thestar.com',
394 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
396 'add_ie': ['BrightcoveLegacy'],
397 'skip': 'video gone',
400 'url': 'http://www.championat.com/video/football/v/87/87499.html',
401 'md5': 'fb973ecf6e4a78a67453647444222983',
403 'id': '3414141473001',
405 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
406 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
407 'uploader': 'Championat',
411 # https://github.com/rg3/youtube-dl/issues/3541
412 'add_ie': ['BrightcoveLegacy'],
413 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
415 'id': '3866516442001',
417 'title': 'Leer mij vrouwen kennen: Aflevering 1',
418 'description': 'Leer mij vrouwen kennen: Aflevering 1',
419 'uploader': 'SBS Broadcasting',
421 'skip': 'Restricted to Netherlands',
423 'skip_download': True, # m3u8 download
427 # Brightcove with alternative playerID key
428 'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html',
430 'id': 'nmeth.2062_SV1',
431 'title': 'Simultaneous multiview imaging of the Drosophila syncytial blastoderm : Quantitative high-speed imaging of entire developing embryos with simultaneous multiview light-sheet microscopy : Nature Methods : Nature Research',
435 'id': '2228375078001',
437 'title': 'nmeth.2062-sv1',
438 'description': 'nmeth.2062-sv1',
439 'timestamp': 1363357591,
440 'upload_date': '20130315',
441 'uploader': 'Nature Publishing Group',
442 'uploader_id': '1964492299001',
448 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
449 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
451 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
453 'title': '2cc213299525360.mov', # that's what we get
456 'add_ie': ['Ooyala'],
459 # ooyala video embedded with http://player.ooyala.com/iframe.js
460 'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/',
462 'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB',
464 'title': '"Steve Jobs: Man in the Machine" trailer',
465 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."',
469 'skip_download': True,
471 'skip': 'movie expired',
475 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
479 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
480 'upload_date': '20140225',
481 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
482 'uploader': 'Tested',
483 'uploader_id': 'testedcom',
485 # No need to test YoutubeIE here
487 'skip_download': True,
492 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
496 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
497 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
499 # HEAD requests lead to endless 301, while GET is OK
500 'expected_warnings': ['301'],
504 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
508 'title': 'Охотское море стало целиком российским',
509 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
513 'skip_download': True,
518 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/',
522 'title': 'Дошкольное воспитание',
527 'url': 'http://www.vestifinance.ru/articles/25753',
530 'title': 'Прямые трансляции с Форума-выставки "Госзаказ-2013"',
535 'title': 'Госзаказ. День 3',
541 'title': 'Госзаказ. День 2',
547 'title': 'Госзаказ. День 1',
553 'skip_download': True,
558 'url': 'http://www.kinomyvi.tv/news/detail/Pervij-dublirovannij-trejler--Uzhastikov-_nOw1',
560 'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e',
562 'title': 'Ужастики, русский трейлер (2015)',
563 'thumbnail': r're:^https?://.*\.jpg$',
569 'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
572 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
574 'playlist_mincount': 7,
575 # This forum no longer allows <iframe> syntax
576 # HTML tags are now displayed as-is
577 'skip': 'No videos on this page',
581 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
582 'md5': '65fdff94098e4a607385a60c5177c638',
586 'title': 'Hidden miracles of the natural world',
587 'uploader': 'Louie Schwartzberg',
588 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
591 # Embedded Ustream video
593 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
594 'md5': '27b99cdb639c9b12a79bca876a073417',
598 'uploader': 'AU SPA: The NSA and Privacy',
599 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
602 # nowvideo embed hidden behind percent encoding
604 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
605 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
607 'id': '06e53103ca9aa',
609 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
610 'description': 'No description',
615 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
616 'md5': '7653032cbb25bf6c80d80f217055fa43',
618 'id': '048195-004_PLUS7-F',
621 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
622 'upload_date': '20140320',
625 'skip_download': 'Requires rtmpdump'
627 'skip': 'video gone',
631 'url': 'http://www.tsprod.com/replay-du-concert-alcaline-de-calogero',
635 'title': 'Alcaline, le concert avec Calogero',
636 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff',
637 'upload_date': '20150226',
638 'timestamp': 1424989860,
643 'skip_download': True,
645 'expected_warnings': [
651 'url': 'http://www.wired.com/2014/04/honda-asimo/',
652 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
654 'id': '53501be369702d3275860000',
656 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
661 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
662 'md5': '441aeeb82eb72c422c7f14ec533999cd',
664 'id': 'k2mm4bCdJ6CQ2i7c8o2',
666 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
667 'description': 'md5:faf028e48a461b8b7fad38f1e104b119',
669 'uploader_id': 'xgditw',
670 'upload_date': '20140425',
671 'timestamp': 1398441542,
673 'add_ie': ['Dailymotion'],
677 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
681 'title': 'The NBL Auction 2014',
682 'uploader': 'BADMINTON England',
683 'uploader_id': 'BADMINTONEvents',
684 'upload_date': '20140603',
685 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
687 'add_ie': ['Youtube'],
689 'skip_download': True,
694 'url': 'http://www.vulture.com/2016/06/new-key-peele-sketches-released.html',
695 'md5': 'ca1aef97695ef2c1d6973256a57e5252',
697 'id': '769f7ec0-0692-4d62-9b45-0d88074bffc1',
699 'title': 'Key and Peele|October 10, 2012|2|203|Liam Neesons - Uncensored',
700 'description': 'Two valets share their love for movie star Liam Neesons.',
701 'timestamp': 1349922600,
702 'upload_date': '20121011',
705 # YouTube embed via <data-embed-url="">
707 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
711 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
712 'uploader': 'Gameloft',
713 'uploader_id': 'gameloft',
714 'upload_date': '20140828',
715 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
718 'skip_download': True,
723 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
725 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
727 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
728 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
733 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
735 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
736 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
742 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
747 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
748 'md5': '9d65602bf31c6e20014319c7d07fba27',
750 'id': '5123ea6d5e5a7',
753 'uploader': 'www.handjobhub.com',
754 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
757 # Multiple brightcove videos
758 # https://github.com/rg3/youtube-dl/issues/2283
760 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
762 'id': 'always-never',
763 'title': 'Always / Never - The New Yorker',
767 'extract_flat': False,
768 'skip_download': True,
773 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
774 'md5': '96f09a37e44da40dd083e12d9a683327',
778 'title': 'Ump changes call to ball',
779 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
781 'timestamp': 1401537900,
782 'upload_date': '20140531',
783 'thumbnail': r're:^https?://.*\.jpg$',
788 'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
789 'md5': '1953f3a698ab51cfc948ed3992a0b7ff',
793 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england',
794 'description': 'a Paywall Videos video from Remilon',
796 'uploader': 'study.com',
797 'timestamp': 1459678540,
798 'upload_date': '20160403',
799 'filesize': 24687186,
803 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
804 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
808 'title': 'Conversation about Hexagonal Rails Part 1',
809 'description': 'a Martin Fowler video from ThoughtWorks',
811 'uploader': 'thoughtworks.wistia.com',
812 'timestamp': 1401832161,
813 'upload_date': '20140603',
816 # Wistia standard embed (async)
818 'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/',
822 'title': 'Drip Brennan Dunn Workshop',
823 'description': 'a JV Webinars video from getdrip-1',
825 'timestamp': 1463607249,
826 'upload_date': '20160518',
829 'skip_download': True,
834 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
838 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
839 'uploader': 'Sophos Security',
840 'title': 'Chet Chat 171 - Oct 29, 2014',
841 'upload_date': '20141029',
844 # Soundcloud multiple embeds
846 'url': 'http://www.guitarplayer.com/lessons/1014/legato-workout-one-hour-to-more-fluid-performance---tab/52809',
849 'title': 'Guitar Essentials: Legato Workout—One-Hour to Fluid Performance | TAB + AUDIO',
851 'playlist_mincount': 7,
853 # TuneIn station embed
855 'url': 'http://radiocnrv.com/promouvoir-radio-cnrv/',
860 'location': 'Paris, France',
865 'skip_download': True,
870 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
874 'upload_date': '20141112',
875 'title': 'Rosetta #CometLanding webcast HL 10',
878 # Another Livestream embed, without 'new.' in URL
880 'url': 'https://www.freespeech.org/',
884 'title': 're:^FSTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
888 'skip_download': True,
893 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
896 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
898 'playlist_mincount': 2,
902 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
906 'upload_date': '20141126',
907 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
912 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
914 'id': '730m_DandD_1901_512k',
916 'uploader': 'www.abc.net.au',
917 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
920 # embedded viddler video
922 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
926 'uploader': 'deadspin',
927 'title': 'WALL-TO-GORTAT',
928 'timestamp': 1422285291,
929 'upload_date': '20150126',
931 'add_ie': ['Viddler'],
935 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
939 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
940 'description': 'md5:601cb790edd05908957dae8aaa866465',
941 'upload_date': '20150220',
943 'skip': 'All The Daily Show URLs now redirect to http://www.cc.com/shows/',
947 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
951 'upload_date': '20150212',
952 'uploader': 'The National Archives UK',
953 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
954 'uploader_id': 'NationalArchives08',
955 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
960 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
961 'playlist_mincount': 5,
963 'id': 'aanslagen-kopenhagen',
964 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
969 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
973 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
976 # Kaltura embed (different embed code)
978 'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014',
982 'upload_date': '20150127',
983 'uploader_id': 'PremierMedia',
985 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014',
988 # Kaltura embed protected with referrer
990 'url': 'http://www.disney.nl/disney-channel/filmpjes/achter-de-schermen#/videoId/violetta-achter-de-schermen-ruggero',
994 'title': 'Violetta - Achter De Schermen - Ruggero',
995 'description': 'Achter de schermen met Ruggero',
996 'timestamp': 1435133761,
997 'upload_date': '20150624',
998 'uploader_id': 'echojecka',
1001 # Kaltura embed with single quotes
1003 'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY',
1008 'timestamp': 1355743100,
1009 'upload_date': '20121217',
1010 'uploader_id': 'batchUser',
1012 'add_ie': ['Kaltura'],
1015 # Kaltura embedded via quoted entry_id
1016 'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures',
1020 'title': '06_matthew_brender_raj_dutt',
1021 'timestamp': 1466638791,
1022 'upload_date': '20160622',
1024 'add_ie': ['Kaltura'],
1025 'expected_warnings': [
1026 'Could not send HEAD request'
1029 'skip_download': True,
1033 # Kaltura embedded, some fileExt broken (#11480)
1034 'url': 'http://www.cornell.edu/video/nima-arkani-hamed-standard-models-of-particle-physics',
1038 'title': 'Our "Standard Models" of particle physics and cosmology',
1039 'description': 'md5:67ea74807b8c4fea92a6f38d6d323861',
1040 'timestamp': 1321158993,
1041 'upload_date': '20111113',
1042 'uploader_id': 'kps1',
1044 'add_ie': ['Kaltura'],
1046 # Eagle.Platform embed (generic URL)
1048 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
1049 # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
1053 'title': 'Навальный вышел на свободу',
1054 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
1055 'thumbnail': r're:^https?://.*\.jpg$',
1061 # ClipYou (Eagle.Platform) embed (custom URL)
1063 'url': 'http://muz-tv.ru/play/7129/',
1064 # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
1068 'title': "'O Sole Mio",
1069 'thumbnail': r're:^https?://.*\.jpg$',
1076 'url': 'http://muz-tv.ru/kinozal/view/7400/',
1080 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
1081 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
1082 'thumbnail': r're:^https?://.*\.jpg$',
1089 'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
1093 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
1094 'thumbnail': r're:^https?://.*\.png$',
1100 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
1101 'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
1105 'title': 'Facebook Creates "On This Day" | Crunch Report',
1110 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
1114 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
1119 # Crooks and Liars embed
1121 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
1125 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
1126 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
1127 'timestamp': 1428207000,
1128 'upload_date': '20150405',
1129 'uploader': 'Heather',
1132 # Crooks and Liars external embed
1134 'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
1136 'id': 'MTE3MjUtMzQ2MzA',
1138 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
1139 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
1140 'timestamp': 1265032391,
1141 'upload_date': '20100201',
1142 'uploader': 'Heather',
1145 # NBC Sports vplayer embed
1147 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
1149 'id': 'ln7x1qSThw4k',
1151 'title': "PFT Live: New leader in the 'new-look' defense",
1152 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
1153 'uploader': 'NBCU-SPORTS',
1154 'upload_date': '20140107',
1155 'timestamp': 1389118457,
1160 'url': 'http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html',
1161 'md5': '1aa589c675898ae6d37a17913cf68d66',
1163 'id': '701714499682',
1165 'title': 'PREVIEW: On Assignment: David Letterman',
1166 'description': 'A preview of Tom Brokaw\'s interview with David Letterman as part of the On Assignment series powered by Dateline. Airs Sunday June 12 at 7/6c.',
1171 'url': 'https://video.udn.com/news/300346',
1172 'md5': 'fd2060e988c326991037b9aff9df21a6',
1176 'title': '中一中男師變性 全校師生力挺',
1177 'thumbnail': r're:^https?://.*\.jpg$',
1181 'skip_download': True,
1186 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
1188 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
1190 'description': 'VIDEO: INDEX/MATCH versus VLOOKUP.',
1191 'title': 'This is what separates the Excel masters from the wannabes',
1192 'duration': 191.933,
1196 'skip_download': True,
1199 # Brightcove URL in single quotes
1201 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
1202 'md5': '4ae374f1f8b91c889c4b9203c8c752af',
1204 'id': '4255764656001',
1206 'title': 'SN Presents: Russell Martin, World Citizen',
1207 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
1208 'uploader': 'Rogers Sportsnet',
1209 'uploader_id': '1704050871',
1210 'upload_date': '20150525',
1211 'timestamp': 1432570283,
1214 # Dailymotion Cloud video
1216 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
1217 'md5': 'dcaf23ad0c67a256f4278bce6e0bae38',
1221 'title': 'Sauvons les abeilles ! - Le débat',
1222 'description': 'md5:d9082128b1c5277987825d684939ca26',
1223 'thumbnail': r're:^https?://.*\.jpe?g$',
1224 'timestamp': 1434970506,
1225 'upload_date': '20150622',
1226 'uploader': 'Public Sénat',
1227 'uploader_id': 'xa9gza',
1230 # OnionStudios embed
1232 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537',
1236 'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You',
1237 'thumbnail': r're:^https?://.*\.jpe?g$',
1238 'uploader': 'ClickHole',
1239 'uploader_id': 'clickhole',
1244 'url': 'http://whilewewatch.blogspot.ru/2012/06/whilewewatch-whilewewatch-gripping.html',
1246 'id': '74849a00-85a9-11e1-9660-123139220831',
1248 'title': '#whilewewatch',
1251 # AdobeTVVideo embed
1253 'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners',
1254 'md5': '43662b577c018ad707a63766462b1e87',
1258 'title': 'New experience with Acrobat DC',
1259 'description': 'New experience with Acrobat DC',
1260 'duration': 248.667,
1263 # BrightcoveInPageEmbed embed
1265 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/',
1267 'id': '4238694884001',
1269 'title': 'Tabletop: Dread, Last Thoughts',
1270 'description': 'Tabletop: Dread, Last Thoughts',
1274 # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions'
1275 # This video can't be played in browsers if Flash is disabled and the UA is set to iPhone, which is actually a false alarm
1277 'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html',
1279 'id': '4785848093001',
1281 'title': 'The Cardinal Pell Interview',
1282 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ',
1283 'uploader': 'GlobeCast Australia - GlobeStream',
1284 'uploader_id': '2733773828001',
1285 'upload_date': '20160304',
1286 'timestamp': 1457083087,
1290 'skip_download': True,
1293 # Another form of arte.tv embed
1295 'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html',
1296 'md5': '850bfe45417ddf221288c88a0cffe2e2',
1298 'id': '030273-562_PLUS7-F',
1300 'title': 'ARTE Reportage - Nulle part, en France',
1301 'description': 'md5:e3a0e8868ed7303ed509b9e3af2b870d',
1302 'upload_date': '20160409',
1307 'url': 'http://www.wykop.pl/link/3088787/',
1308 'md5': 'ace83b9ed19b21f68e1b50e844fdf95d',
1310 'id': '874_1459135191',
1312 'title': 'Man shows poor quality of new apartment building',
1313 'description': 'The wall is like a sand pile.',
1314 'uploader': 'Lake8737',
1317 # Duplicated embedded video URLs
1319 'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443',
1321 'id': '149298443_480_16c25b74_2',
1323 'title': 'vs. Blue Orange Spring Game',
1324 'uploader': 'www.hudl.com',
1327 # twitter:player:stream embed
1329 'url': 'http://www.rtl.be/info/video/589263.aspx?CategoryID=288',
1333 'title': 'Une nouvelle espèce de dinosaure découverte en Argentine',
1334 'uploader': 'www.rtl.be',
1338 'skip_download': True,
1341 # twitter:player embed
1343 'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/',
1344 'md5': 'a3e0df96369831de324f0778e126653c',
1346 'id': '4909620399001',
1348 'title': 'What Do Black Holes Sound Like?',
1349 'description': 'what do black holes sound like',
1350 'upload_date': '20160524',
1351 'uploader_id': '29913724001',
1352 'timestamp': 1464107587,
1353 'uploader': 'TheAtlantic',
1355 'add_ie': ['BrightcoveLegacy'],
1357 # Facebook <iframe> embed
1359 'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html',
1360 'md5': 'fbcde74f534176ecb015849146dd3aee',
1362 'id': '599637780109885',
1364 'title': 'Facebook video #599637780109885',
1367 # Facebook API embed
1369 'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/',
1370 'md5': 'a47372ee61b39a7b90287094d447d94e',
1372 'id': '10153467542406923',
1374 'title': 'Facebook video #10153467542406923',
1377 # Wordpress "YouTube Video Importer" plugin
1379 'url': 'http://www.lothype.com/blue-devils-drumline-stanford-lot-2016/',
1380 'md5': 'd16797741b560b485194eddda8121b48',
1382 'id': 'HNTXWDXV9Is',
1384 'title': 'Blue Devils Drumline Stanford lot 2016',
1385 'upload_date': '20160627',
1386 'uploader_id': 'GENOCIDE8GENERAL10',
1387 'uploader': 'cylus cyrus',
1391 # video stored on custom kaltura server
1392 'url': 'http://www.expansion.com/multimedia/videos.html?media=EQcM30NHIPv',
1393 'md5': '537617d06e64dfed891fa1593c4b30cc',
1397 'title': 'Elecciones británicas: 5 lecciones para Rajoy',
1398 'description': 'md5:435a89d68b9760b92ce67ed227055f16',
1399 'uploader_id': 'videos.expansion@el-mundo.net',
1400 'upload_date': '20150429',
1401 'timestamp': 1430303472,
1403 'add_ie': ['Kaltura'],
1406 # Non-standard Vimeo embed
1407 'url': 'https://openclassrooms.com/courses/understanding-the-web',
1408 'md5': '64d86f1c7d369afd9a78b38cbb88d80a',
1412 'title': 'Understanding the web - Teaser',
1413 'description': 'This is "Understanding the web - Teaser" by openclassrooms on Vimeo, the home for high quality videos and the people who love them.',
1414 'upload_date': '20151214',
1415 'uploader': 'OpenClassrooms',
1416 'uploader_id': 'openclassrooms',
1418 'add_ie': ['Vimeo'],
1421 # generic vimeo embed that requires original URL passed as Referer
1422 'url': 'http://racing4everyone.eu/2016/07/30/formula-1-2016-round12-germany/',
1423 'only_matching': True,
1426 'url': 'https://support.arkena.com/display/PLAY/Ways+to+embed+your+video',
1427 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365',
1429 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe',
1431 'title': 'Big Buck Bunny',
1432 'description': 'Royalty free test video',
1433 'timestamp': 1432816365,
1434 'upload_date': '20150528',
1438 'skip_download': True,
1440 'add_ie': [ArkenaIE.ie_key()],
1443 'url': 'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/',
1447 'title': 'НА КОСЪМ ОТ ВЗРИВ: Изтичане на газ на бензиностанция в Пловдив',
1450 'skip_download': True,
1452 'add_ie': [Vbox7IE.ie_key()],
1456 'url': 'http://www.dagbladet.no/2016/02/23/nyheter/nordlys/ski/troms/ver/43254897/',
1459 'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans',
1461 'playlist_mincount': 3,
1465 'url': 'http://forum.dvdtalk.com/movie-talk/623756-deleted-magic-star-wars-ot-deleted-alt-scenes-docu-style.html',
1467 'id': '623756-deleted-magic-star-wars-ot-deleted-alt-scenes-docu-style',
1468 'title': 'Deleted Magic - Star Wars: OT Deleted / Alt. Scenes Docu. Style - DVD Talk Forum',
1470 'playlist_mincount': 2,
1474 'url': 'http://www.20min.ch/schweiz/news/story/So-kommen-Sie-bei-Eis-und-Schnee-sicher-an-27032552',
1478 'title': 'So kommen Sie bei Eis und Schnee sicher an',
1479 'description': 'md5:117c212f64b25e3d95747e5276863f7d',
1482 'skip_download': True,
1484 'add_ie': [TwentyMinutenIE.ie_key()],
1487 # # TODO: find another test
1488 # # http://schema.org/VideoObject
1489 # 'url': 'https://flipagram.com/f/nyvTSJMKId',
1490 # 'md5': '888dcf08b7ea671381f00fab74692755',
1492 # 'id': 'nyvTSJMKId',
1494 # 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction',
1495 # 'description': '#love for cats.',
1496 # 'timestamp': 1461244995,
1497 # 'upload_date': '20160421',
1500 # 'force_generic_extractor': True,
def report_following_redirect(self, new_url):
    """Print a notice that the redirect to new_url is being followed."""
    notice = '[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(notice)
def _extract_rss(self, url, video_id, doc):
    """Build a playlist result from a parsed RSS document.

    url is used as the playlist id, video_id is accepted for signature
    compatibility only and doc is the element whose ./channel child holds
    the feed.

    Every <item> contributes one 'url' entry pointing at its <link> or,
    failing that, at the first <enclosure> carrying a non-empty url
    attribute; items with neither are skipped.  Missing <title> or
    <description> elements yield None instead of raising.
    """
    def text_or_none(node, path):
        # Element.find() returns None for absent elements, so .text must
        # not be dereferenced blindly (RSS items may omit <title>)
        element = node.find(path)
        return None if element is None else element.text

    entries = []
    for item in doc.findall('./channel/item'):
        next_url = text_or_none(item, 'link')
        if not next_url:
            # No usable <link>: fall back to the first enclosure with a URL
            next_url = next(
                (enclosure.attrib['url']
                 for enclosure in item.findall('./enclosure')
                 if enclosure.attrib.get('url')),
                None)
        if not next_url:
            continue

        entries.append({
            '_type': 'url',
            'url': next_url,
            'title': text_or_none(item, 'title'),
        })

    return {
        '_type': 'playlist',
        'id': url,
        'title': text_or_none(doc, './channel/title'),
        'description': text_or_none(doc, './channel/description'),
        'entries': entries,
    }
def _extract_camtasia(self, url, video_id, webpage):
    """Extract a playlist from a page embedding a Camtasia player.

    Returns None if no camtasia video can be found, i.e. when the page
    does not set a "csConfigFile" variable or the downloaded configuration
    contains no ./playlist/array/fileset element.  Otherwise returns a
    playlist dict with one entry per fileset child that has a <uri>.
    """
    camtasia_cfg = self._search_regex(
        r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
        webpage, 'camtasia configuration file', default=None)
    if camtasia_cfg is None:
        return None

    title = self._html_search_meta('DC.title', webpage, fatal=True)

    # The configuration path found in the page is resolved against the page URL
    camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
    camtasia_cfg = self._download_xml(
        camtasia_url, video_id,
        note='Downloading camtasia configuration',
        errnote='Failed to download camtasia configuration')
    fileset_node = camtasia_cfg.find('./playlist/array/fileset')
    if fileset_node is None:
        return None

    entries = []
    # Iterate the element itself: Element.getchildren() was removed in
    # Python 3.9, while direct iteration works on every supported version
    for n in fileset_node:
        url_n = n.find('./uri')
        if url_n is None or not url_n.text:
            continue

        duration_n = n.find('./duration')
        entries.append({
            'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
            'title': '%s - %s' % (title, n.tag),
            'url': compat_urlparse.urljoin(url, url_n.text),
            'duration': float_or_none(
                None if duration_n is None else duration_n.text),
        })

    return {
        '_type': 'playlist',
        'entries': entries,
        'title': title,
    }
1578 def _real_extract(self, url):
1579 if url.startswith('//'):
1582 'url': self.http_scheme() + url,
1585 parsed_url = compat_urlparse.urlparse(url)
1586 if not parsed_url.scheme:
1587 default_search = self._downloader.params.get('default_search')
1588 if default_search is None:
1589 default_search = 'fixup_error'
1591 if default_search in ('auto', 'auto_warning', 'fixup_error'):
1593 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
1594 return self.url_result('http://' + url)
1595 elif default_search != 'fixup_error':
1596 if default_search == 'auto_warning':
1597 if re.match(r'^(?:url|URL)$', url):
1598 raise ExtractorError(
1599 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
1602 self._downloader.report_warning(
1603 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
1604 return self.url_result('ytsearch:' + url)
1606 if default_search in ('error', 'fixup_error'):
1607 raise ExtractorError(
1608 '%r is not a valid URL. '
1609 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
1610 % (url, url), expected=True)
1612 if ':' not in default_search:
1613 default_search += ':'
1614 return self.url_result(default_search + url)
1616 url, smuggled_data = unsmuggle_url(url)
1617 force_videoid = None
1618 is_intentional = smuggled_data and smuggled_data.get('to_generic')
1619 if smuggled_data and 'force_videoid' in smuggled_data:
1620 force_videoid = smuggled_data['force_videoid']
1621 video_id = force_videoid
1623 video_id = self._generic_id(url)
1625 self.to_screen('%s: Requesting header' % video_id)
1627 head_req = HEADRequest(url)
1628 head_response = self._request_webpage(
1630 note=False, errnote='Could not send HEAD request to %s' % url,
1633 if head_response is not False:
1634 # Check for redirect
1635 new_url = head_response.geturl()
1637 self.report_following_redirect(new_url)
1639 new_url = smuggle_url(
1640 new_url, {'force_videoid': force_videoid})
1641 return self.url_result(new_url)
1643 full_response = None
1644 if head_response is False:
1645 request = sanitized_Request(url)
1646 request.add_header('Accept-Encoding', '*')
1647 full_response = self._request_webpage(request, video_id)
1648 head_response = full_response
1652 'title': self._generic_title(url),
1653 'upload_date': unified_strdate(head_response.headers.get('Last-Modified'))
1656 # Check for direct link to a video
1657 content_type = head_response.headers.get('Content-Type', '').lower()
1658 m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
1660 format_id = m.group('format_id')
1661 if format_id.endswith('mpegurl'):
1662 formats = self._extract_m3u8_formats(url, video_id, 'mp4')
1663 elif format_id == 'f4m':
1664 formats = self._extract_f4m_formats(url, video_id)
1667 'format_id': m.group('format_id'),
1669 'vcodec': 'none' if m.group('type') == 'audio' else None
1671 info_dict['direct'] = True
1672 self._sort_formats(formats)
1673 info_dict['formats'] = formats
1676 if not self._downloader.params.get('test', False) and not is_intentional:
1677 force = self._downloader.params.get('force_generic_extractor', False)
1678 self._downloader.report_warning(
1679 '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
1681 if not full_response:
1682 request = sanitized_Request(url)
# Some webservers may serve compressed content of rather big size (e.g. gzipped flac),
# making it impossible to download only a chunk of the file (yet we need only the first
# 512 bytes to test whether it's HTML or not). With youtube-dl's default Accept-Encoding
# this will always result in downloading the whole file, which is not desirable.
# Therefore for the extraction pass we have to override Accept-Encoding to any in order
# to accept raw bytes and be able to download only a chunk.
# It would probably be better to solve this by checking Content-Type for application/octet-stream
# after the HEAD request finishes, but we are not sure we can rely on this.
1691 request.add_header('Accept-Encoding', '*')
1692 full_response = self._request_webpage(request, video_id)
1694 first_bytes = full_response.read(512)
1696 # Is it an M3U playlist?
1697 if first_bytes.startswith(b'#EXTM3U'):
1698 info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4')
1699 self._sort_formats(info_dict['formats'])
1702 # Maybe it's a direct link to a video?
1703 # Be careful not to download the whole thing!
1704 if not is_html(first_bytes):
1705 self._downloader.report_warning(
1706 'URL could be a direct video link, returning it as such.')
1713 webpage = self._webpage_read_content(
1714 full_response, url, video_id, prefix=first_bytes)
1716 self.report_extraction(video_id)
1718 # Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest?
1720 doc = compat_etree_fromstring(webpage.encode('utf-8'))
1721 if doc.tag == 'rss':
1722 return self._extract_rss(url, video_id, doc)
1723 elif doc.tag == 'SmoothStreamingMedia':
1724 info_dict['formats'] = self._parse_ism_formats(doc, url)
1725 self._sort_formats(info_dict['formats'])
1727 elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
1728 smil = self._parse_smil(doc, url, video_id)
1729 self._sort_formats(smil['formats'])
1731 elif doc.tag == '{http://xspf.org/ns/0/}playlist':
1732 return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
1733 elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
1734 info_dict['formats'] = self._parse_mpd_formats(
1736 mpd_base_url=full_response.geturl().rpartition('/')[0],
1738 self._sort_formats(info_dict['formats'])
1740 elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
1741 info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
1742 self._sort_formats(info_dict['formats'])
1744 except compat_xml_parse_error:
1747 # Is it a Camtasia project?
1748 camtasia_res = self._extract_camtasia(url, video_id, webpage)
1749 if camtasia_res is not None:
# Sometimes the embedded video player is hidden behind percent encoding
# (e.g. https://github.com/rg3/youtube-dl/issues/2448)
# Unescaping the whole page allows handling those cases in a generic way
1755 webpage = compat_urllib_parse_unquote(webpage)
1757 # it's tempting to parse this further, but you would
1758 # have to take into account all the variations like
1759 # Video Title - Site Name
1760 # Site Name | Video Title
1761 # Video Title - Tagline | Site Name
1762 # and so on and so forth; it's just not practical
1763 video_title = self._og_search_title(
1764 webpage, default=None) or self._html_search_regex(
1765 r'(?s)<title>(.*?)</title>', webpage, 'video title',
1768 # Try to detect age limit automatically
1769 age_limit = self._rta_search(webpage)
1770 # And then there are the jokers who advertise that they use RTA,
1771 # but actually don't.
1772 AGE_LIMIT_MARKERS = [
1773 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
1775 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
1778 # video uploader is domain name
1779 video_uploader = self._search_regex(
1780 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
1782 video_description = self._og_search_description(webpage, default=None)
1783 video_thumbnail = self._og_search_thumbnail(webpage, default=None)
1786 def _playlist_from_matches(matches, getter=None, ie=None):
1788 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1790 return self.playlist_result(
1791 urlrs, playlist_id=video_id, playlist_title=video_title)
1793 # Look for Brightcove Legacy Studio embeds
1794 bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
1796 self.to_screen('Brightcove video detected.')
1799 'url': smuggle_url(bc_url, {'Referer': url}),
1800 'ie_key': 'BrightcoveLegacy'
1801 } for bc_url in bc_urls]
1804 '_type': 'playlist',
1805 'title': video_title,
1810 # Look for Brightcove New Studio embeds
1811 bc_urls = BrightcoveNewIE._extract_urls(webpage)
1813 return _playlist_from_matches(bc_urls, ie='BrightcoveNew')
1815 # Look for ThePlatform embeds
1816 tp_urls = ThePlatformIE._extract_urls(webpage)
1818 return _playlist_from_matches(tp_urls, ie='ThePlatform')
1820 # Look for Vessel embeds
1821 vessel_urls = VesselIE._extract_urls(webpage)
1823 return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key())
1825 # Look for embedded rtl.nl player
1826 matches = re.findall(
1827 r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
1830 return _playlist_from_matches(matches, ie='RtlNl')
1832 vimeo_urls = VimeoIE._extract_urls(url, webpage)
1834 return _playlist_from_matches(vimeo_urls, ie=VimeoIE.ie_key())
1836 vid_me_embed_url = self._search_regex(
1837 r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
1838 webpage, 'vid.me embed', default=None)
1839 if vid_me_embed_url is not None:
1840 return self.url_result(vid_me_embed_url, 'Vidme')
1842 # Look for embedded YouTube player
1843 matches = re.findall(r'''(?x)
1852 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1856 return _playlist_from_matches(
1857 matches, lambda m: unescapeHTML(m[1]))
1859 # Look for lazyYT YouTube embed
1860 matches = re.findall(
1861 r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1863 return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1865 # Look for Wordpress "YouTube Video Importer" plugin
1866 matches = re.findall(r'''(?x)<div[^>]+
1867 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1868 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1870 return _playlist_from_matches(matches, lambda m: m[-1])
1872 matches = DailymotionIE._extract_urls(webpage)
1874 return _playlist_from_matches(matches)
1876 # Look for embedded Dailymotion playlist player (#3822)
1878 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1880 playlists = re.findall(
1881 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1883 return _playlist_from_matches(
1884 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1886 # Look for embedded Wistia player
1888 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
1890 embed_url = self._proto_relative_url(
1891 unescapeHTML(match.group('url')))
1893 '_type': 'url_transparent',
1896 'uploader': video_uploader,
1899 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
1902 '_type': 'url_transparent',
1903 'url': 'wistia:%s' % match.group('id'),
1905 'uploader': video_uploader,
1910 <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
1911 <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2
1914 return self.url_result(self._proto_relative_url(
1915 'wistia:%s' % match.group('id')), 'Wistia')
1917 # Look for SVT player
1918 svt_url = SVTIE._extract_url(webpage)
1920 return self.url_result(svt_url, 'SVT')
1922 # Look for embedded condenast player
1923 matches = re.findall(
1924 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1928 '_type': 'playlist',
1931 'ie_key': 'CondeNast',
1933 } for ma in matches],
1934 'title': video_title,
1938 # Look for Bandcamp pages with custom domain
1939 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1940 if mobj is not None:
1941 burl = unescapeHTML(mobj.group(1))
1942 # Don't set the extractor because it can be a track url or an album
1943 return self.url_result(burl)
1945 # Look for embedded Vevo player
1947 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1948 if mobj is not None:
1949 return self.url_result(mobj.group('url'))
1951 # Look for embedded Viddler player
1953 r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1955 if mobj is not None:
1956 return self.url_result(mobj.group('url'))
1958 # Look for NYTimes player
1960 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1962 if mobj is not None:
1963 return self.url_result(mobj.group('url'))
1965 # Look for Libsyn player
1967 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1968 if mobj is not None:
1969 return self.url_result(mobj.group('url'))
1971 # Look for Ooyala videos
1972 mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1973 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1974 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1975 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
1976 if mobj is not None:
1977 embed_token = self._search_regex(
1978 r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)',
1979 webpage, 'ooyala embed token', default=None)
1980 return OoyalaIE._build_url_result(smuggle_url(
1983 'embed_token': embed_token,
1986 # Look for multiple Ooyala embeds on SBN network websites
1987 mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1988 if mobj is not None:
1989 embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1991 return _playlist_from_matches(
1992 embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')
1994 # Look for Aparat videos
1995 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1996 if mobj is not None:
1997 return self.url_result(mobj.group(1), 'Aparat')
1999 # Look for MPORA videos
2000 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
2001 if mobj is not None:
2002 return self.url_result(mobj.group(1), 'Mpora')
2004 # Look for embedded NovaMov-based player
2006 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
2007 (?P<url>http://(?:(?:embed|www)\.)?
2009 nowvideo\.(?:ch|sx|eu|at|ag|co)|
2010 videoweed\.(?:es|com)|
2011 movshare\.(?:net|sx|ag)|
2012 divxstage\.(?:eu|net|ch|co|at|ag))
2013 /embed\.php.+?)\1''', webpage)
2014 if mobj is not None:
2015 return self.url_result(mobj.group('url'))
2017 # Look for embedded Facebook player
2018 facebook_url = FacebookIE._extract_url(webpage)
2019 if facebook_url is not None:
2020 return self.url_result(facebook_url, 'Facebook')
2022 # Look for embedded VK player
2023 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
2024 if mobj is not None:
2025 return self.url_result(mobj.group('url'), 'VK')
2027 # Look for embedded Odnoklassniki player
2028 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage)
2029 if mobj is not None:
2030 return self.url_result(mobj.group('url'), 'Odnoklassniki')
2032 # Look for embedded ivi player
2033 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
2034 if mobj is not None:
2035 return self.url_result(mobj.group('url'), 'Ivi')
2037 # Look for embedded Huffington Post player
2039 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
2040 if mobj is not None:
2041 return self.url_result(mobj.group('url'), 'HuffPost')
2044 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
2045 if mobj is not None:
2046 return self.url_result(mobj.group('url'))
2047 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
2048 if mobj is not None:
2049 return self.url_result(compat_urllib_parse_unquote(mobj.group('url')))
2051 # Look for funnyordie embed
2052 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
2054 return _playlist_from_matches(
2055 matches, getter=unescapeHTML, ie='FunnyOrDie')
2057 # Look for BBC iPlayer embed
2058 matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
2060 return _playlist_from_matches(matches, ie='BBCCoUk')
2062 # Look for embedded RUTV player
2063 rutv_url = RUTVIE._extract_url(webpage)
2065 return self.url_result(rutv_url, 'RUTV')
2067 # Look for embedded TVC player
2068 tvc_url = TVCIE._extract_url(webpage)
2070 return self.url_result(tvc_url, 'TVC')
2072 # Look for embedded SportBox player
2073 sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
2075 return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
2077 # Look for embedded XHamster player
2078 xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
2080 return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
2082 # Look for embedded TNAFlixNetwork player
2083 tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage)
2085 return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key())
2087 # Look for embedded PornHub player
2088 pornhub_urls = PornHubIE._extract_urls(webpage)
2090 return _playlist_from_matches(pornhub_urls, ie=PornHubIE.ie_key())
2092 # Look for embedded DrTuber player
2093 drtuber_urls = DrTuberIE._extract_urls(webpage)
2095 return _playlist_from_matches(drtuber_urls, ie=DrTuberIE.ie_key())
2097 # Look for embedded RedTube player
2098 redtube_urls = RedTubeIE._extract_urls(webpage)
2100 return _playlist_from_matches(redtube_urls, ie=RedTubeIE.ie_key())
2102 # Look for embedded Tvigle player
2104 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
2105 if mobj is not None:
2106 return self.url_result(mobj.group('url'), 'Tvigle')
2108 # Look for embedded TED player
2110 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
2111 if mobj is not None:
2112 return self.url_result(mobj.group('url'), 'TED')
2114 # Look for embedded Ustream videos
2116 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
2117 if mobj is not None:
2118 return self.url_result(mobj.group('url'), 'Ustream')
2120 # Look for embedded arte.tv player
2122 r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"',
2124 if mobj is not None:
2125 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
2127 # Look for embedded francetv player
2129 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1',
2131 if mobj is not None:
2132 return self.url_result(mobj.group('url'))
2134 # Look for embedded smotri.com player
2135 smotri_url = SmotriIE._extract_url(webpage)
2137 return self.url_result(smotri_url, 'Smotri')
2139 # Look for embedded Myvi.ru player
2140 myvi_url = MyviIE._extract_url(webpage)
2142 return self.url_result(myvi_url)
2144 # Look for embedded soundcloud player
2145 soundcloud_urls = SoundcloudIE._extract_urls(webpage)
2147 return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key())
2149 # Look for tunein player
2150 tunein_urls = TuneInBaseIE._extract_urls(webpage)
2152 return _playlist_from_matches(tunein_urls)
2154 # Look for embedded mtvservices player
2155 mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
2157 return self.url_result(mtvservices_url, ie='MTVServicesEmbedded')
2159 # Look for embedded yahoo player
2161 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
2163 if mobj is not None:
2164 return self.url_result(mobj.group('url'), 'Yahoo')
2166 # Look for embedded sbs.com.au player
2170 <meta\s+property="og:video"\s+content=|
2173 (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
2175 if mobj is not None:
2176 return self.url_result(mobj.group('url'), 'SBS')
2178 # Look for embedded Cinchcast player
2180 r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
2182 if mobj is not None:
2183 return self.url_result(mobj.group('url'), 'Cinchcast')
2186 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
2190 r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
2192 if mobj is not None:
2193 return self.url_result(mobj.group('url'), 'MLB')
2196 r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
2198 if mobj is not None:
2199 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
2202 r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"',
2204 if mobj is not None:
2205 return self.url_result(mobj.group('url'), 'Livestream')
2207 # Look for Zapiks embed
2209 r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
2210 if mobj is not None:
2211 return self.url_result(mobj.group('url'), 'Zapiks')
2213 # Look for Kaltura embeds
2214 kaltura_url = KalturaIE._extract_url(webpage)
2216 return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key())
2218 # Look for Eagle.Platform embeds
2219 eagleplatform_url = EaglePlatformIE._extract_url(webpage)
2220 if eagleplatform_url:
2221 return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key())
2223 # Look for ClipYou (uses Eagle.Platform) embeds
2225 r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
2226 if mobj is not None:
2227 return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
2229 # Look for Pladform embeds
2230 pladform_url = PladformIE._extract_url(webpage)
2232 return self.url_result(pladform_url)
2234 # Look for Videomore embeds
2235 videomore_url = VideomoreIE._extract_url(webpage)
2237 return self.url_result(videomore_url)
2239 # Look for Webcaster embeds
2240 webcaster_url = WebcasterFeedIE._extract_url(self, webpage)
2242 return self.url_result(webcaster_url, ie=WebcasterFeedIE.ie_key())
2244 # Look for Playwire embeds
2246 r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
2247 if mobj is not None:
2248 return self.url_result(mobj.group('url'))
2250 # Look for 5min embeds
2252 r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
2253 if mobj is not None:
2254 return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
2256 # Look for Crooks and Liars embeds
2258 r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
2259 if mobj is not None:
2260 return self.url_result(mobj.group('url'))
2262 # Look for NBC Sports VPlayer embeds
2263 nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
2265 return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
2267 # Look for NBC News embeds
2268 nbc_news_embed_url = re.search(
2269 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1', webpage)
2270 if nbc_news_embed_url:
2271 return self.url_result(nbc_news_embed_url.group('url'), 'NBCNews')
2273 # Look for Google Drive embeds
2274 google_drive_url = GoogleDriveIE._extract_url(webpage)
2275 if google_drive_url:
2276 return self.url_result(google_drive_url, 'GoogleDrive')
2278 # Look for UDN embeds
2280 r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage)
2281 if mobj is not None:
2282 return self.url_result(
2283 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
2285 # Look for Senate ISVP iframe
2286 senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
2288 return self.url_result(senate_isvp_url, 'SenateISVP')
2290 # Look for Dailymotion Cloud videos
2291 dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
2293 return self.url_result(dmcloud_url, 'DailymotionCloud')
2295 # Look for OnionStudios embeds
2296 onionstudios_url = OnionStudiosIE._extract_url(webpage)
2297 if onionstudios_url:
2298 return self.url_result(onionstudios_url)
2300 # Look for ViewLift embeds
2301 viewlift_url = ViewLiftEmbedIE._extract_url(webpage)
2303 return self.url_result(viewlift_url)
2305 # Look for JWPlatform embeds
2306 jwplatform_url = JWPlatformIE._extract_url(webpage)
2308 return self.url_result(jwplatform_url, 'JWPlatform')
2310 # Look for Digiteka embeds
2311 digiteka_url = DigitekaIE._extract_url(webpage)
2313 return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key())
2315 # Look for Arkena embeds
2316 arkena_url = ArkenaIE._extract_url(webpage)
2318 return self.url_result(arkena_url, ArkenaIE.ie_key())
2320 # Look for Piksel embeds
2321 piksel_url = PikselIE._extract_url(webpage)
2323 return self.url_result(piksel_url, PikselIE.ie_key())
2325 # Look for Limelight embeds
2326 mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage)
2330 'Channel': 'channel',
2331 'ChannelList': 'channel_list',
2333 return self.url_result('limelight:%s:%s' % (
2334 lm[mobj.group(1)], mobj.group(2)), 'Limelight%s' % mobj.group(1), mobj.group(2))
2338 <object[^>]+class=(["\'])LimelightEmbeddedPlayerFlash\1[^>]*>.*?
2340 name=(["\'])flashVars\2[^>]+
2341 value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32})
2344 return self.url_result('limelight:media:%s' % mobj.group('id'))
2346 # Look for AdobeTVVideo embeds
2348 r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
2350 if mobj is not None:
2351 return self.url_result(
2352 self._proto_relative_url(unescapeHTML(mobj.group(1))),
2355 # Look for Vine embeds
2357 r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))',
2359 if mobj is not None:
2360 return self.url_result(
2361 self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine')
2363 # Look for VODPlatform embeds
2365 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vod-platform\.net/[eE]mbed/.+?)\1',
2367 if mobj is not None:
2368 return self.url_result(
2369 self._proto_relative_url(unescapeHTML(mobj.group('url'))), 'VODPlatform')
2371 # Look for Mangomolo embeds
2373 r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?admin\.mangomolo\.com/analytics/index\.php/customers/embed/
2375 video\?.*?\bid=(?P<video_id>\d+)|
2376 index\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)
2377 ).+?)\1''', webpage)
2378 if mobj is not None:
2380 '_type': 'url_transparent',
2381 'url': self._proto_relative_url(unescapeHTML(mobj.group('url'))),
2382 'title': video_title,
2383 'description': video_description,
2384 'thumbnail': video_thumbnail,
2385 'uploader': video_uploader,
2387 video_id = mobj.group('video_id')
2390 'ie_key': 'MangomoloVideo',
2395 'ie_key': 'MangomoloLive',
2396 'id': mobj.group('channel_id'),
2400 # Look for Instagram embeds
2401 instagram_embed_url = InstagramIE._extract_embed_url(webpage)
2402 if instagram_embed_url is not None:
2403 return self.url_result(
2404 self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key())
2406 # Look for LiveLeak embeds
2407 liveleak_url = LiveLeakIE._extract_url(webpage)
2409 return self.url_result(liveleak_url, 'LiveLeak')
2411 # Look for 3Q SDN embeds
2412 threeqsdn_url = ThreeQSDNIE._extract_url(webpage)
2415 '_type': 'url_transparent',
2416 'ie_key': ThreeQSDNIE.ie_key(),
2417 'url': self._proto_relative_url(threeqsdn_url),
2418 'title': video_title,
2419 'description': video_description,
2420 'thumbnail': video_thumbnail,
2421 'uploader': video_uploader,
2424 # Look for VBOX7 embeds
2425 vbox7_url = Vbox7IE._extract_url(webpage)
2427 return self.url_result(vbox7_url, Vbox7IE.ie_key())
2429 # Look for DBTV embeds
2430 dbtv_urls = DBTVIE._extract_urls(webpage)
2432 return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key())
2434 # Look for Videa embeds
2435 videa_urls = VideaIE._extract_urls(webpage)
2437 return _playlist_from_matches(videa_urls, ie=VideaIE.ie_key())
2439 # Look for 20 minuten embeds
2440 twentymin_urls = TwentyMinutenIE._extract_urls(webpage)
2442 return _playlist_from_matches(
2443 twentymin_urls, ie=TwentyMinutenIE.ie_key())
2445 # Looking for http://schema.org/VideoObject
2446 json_ld = self._search_json_ld(
2447 webpage, video_id, default={}, expected_type='VideoObject')
2448 if json_ld.get('url'):
2450 'title': video_title or info_dict['title'],
2451 'description': video_description,
2452 'thumbnail': video_thumbnail,
2453 'age_limit': age_limit
2455 info_dict.update(json_ld)
2458 # Look for HTML5 media
2459 entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
2461 for entry in entries:
2464 'title': video_title,
2466 self._sort_formats(entry['formats'])
2467 return self.playlist_result(entries)
def check_video(vurl):
    """Tell whether ``vurl`` is worth keeping as a video candidate.

    Returns True for any URL the YouTube extractor claims, since the
    candidate loop further down hands such URLs to the Youtube extractor
    and they normally carry no file extension to judge by.  For every
    other URL the path component must contain a dot and its extension
    must not be one of the known non-video types listed below.
    """
    # Accept YouTube links outright; the extension test below would
    # otherwise reject them.
    if YoutubeIE.suitable(vurl):
        return True
    # Only the path is inspected, so dots in the host or the query string
    # do not count as an extension.
    vpath = compat_urlparse.urlparse(vurl).path
    vext = determine_ext(vpath)
    # Flash players, images, subtitle tracks and scripts are not videos.
    return '.' in vpath and vext not in (
        'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js')
def filter_video(urls):
    """Return, as a list and in order, the entries of ``urls`` that check_video accepts."""
    return [candidate for candidate in urls if check_video(candidate)]
2479 # Start with something easy: JW Player in SWFObject
2480 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
2482 # Look for gorilla-vid style embedding
2483 found = filter_video(re.findall(r'''(?sx)
2487 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
2490 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
2492 # Broaden the search a little bit
2493 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
2495 # Broaden the findall a little bit: JWPlayer JS loader
2496 found = filter_video(re.findall(
2497 r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
2500 found = filter_video(re.findall(r'''(?xs)
2501 flowplayer\("[^"]+",\s*
2503 \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
2504 ["']?url["']?\s*:\s*["']([^"']+)["']
2509 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
2511 # Try to find twitter cards info
2512 # twitter:player:stream should be checked before twitter:player since
2513 # it is expected to contain a raw stream (see
2514 # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
2515 found = filter_video(re.findall(
2516 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
2518 # We look for Open Graph info:
2519 # We have to match any number of spaces between elements, since some sites try to align them (e.g. statigr.am)
2520 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
2521 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
2522 if m_video_type is not None:
2523 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
2525 REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
2527 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
2528 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
2531 # Look also in Refresh HTTP header
2532 refresh_header = head_response.headers.get('Refresh')
2534 # In python 2 response HTTP headers are bytestrings
2535 if sys.version_info < (3, 0) and isinstance(refresh_header, str):
2536 refresh_header = refresh_header.decode('iso-8859-1')
2537 found = re.search(REDIRECT_REGEX, refresh_header)
2539 new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1)))
2540 self.report_following_redirect(new_url)
2547 # twitter:player is a https URL to iframe player that may or may not
2548 # be supported by youtube-dl thus this is checked the very last (see
2549 # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
2550 embed_url = self._html_search_meta('twitter:player', webpage, default=None)
2552 return self.url_result(embed_url)
2555 raise UnsupportedError(url)
2558 for video_url in orderedSet(found):
2559 video_url = unescapeHTML(video_url)
2560 video_url = video_url.replace('\\/', '/')
2561 video_url = compat_urlparse.urljoin(url, video_url)
2562 video_id = compat_urllib_parse_unquote(os.path.basename(video_url))
2564 # Sometimes, jwplayer extraction will result in a YouTube URL
2565 if YoutubeIE.suitable(video_url):
2566 entries.append(self.url_result(video_url, 'Youtube'))
2569 # here's a fun little line of code for you:
2570 video_id = os.path.splitext(video_id)[0]
2574 'uploader': video_uploader,
2575 'title': video_title,
2576 'age_limit': age_limit,
2579 ext = determine_ext(video_url)
2581 entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id)
2583 return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
2585 entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
2587 entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id)
2589 entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id)
2590 elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url:
2591 # Just matching .ism/manifest is not enough to be reliably sure
2592 # whether it's actually an ISM manifest or some other streaming
2593 # manifest since there are various streaming URL formats
2594 # possible (see [1]) as well as some other shenanigans like
2595 # .smil/manifest URLs that actually serve an ISM (see [2]) and
2597 # Thus the most reasonable way to solve this is to delegate
2598 # to generic extractor in order to look into the contents of
2599 # the manifest itself.
2600 # 1. https://azure.microsoft.com/en-us/documentation/articles/media-services-deliver-content-overview/#streaming-url-formats
2601 # 2. https://svs.itworkscdn.net/lbcivod/smil:itwfcdn/lbci/170976.smil/Manifest
2602 entry_info_dict = self.url_result(
2603 smuggle_url(video_url, {'to_generic': True}),
2606 entry_info_dict['url'] = video_url
2608 if entry_info_dict.get('formats'):
2609 self._sort_formats(entry_info_dict['formats'])
2611 entries.append(entry_info_dict)
2613 if len(entries) == 1:
2616 for num, e in enumerate(entries, start=1):
2617 # 'url' results don't have a title
2618 if e.get('title') is not None:
2619 e['title'] = '%s (%d)' % (e['title'], num)
2621 '_type': 'playlist',