3 from __future__ import unicode_literals
8 from .common import InfoExtractor
9 from .youtube import YoutubeIE
10 from ..compat import (
12 compat_urllib_parse_unquote,
13 compat_urllib_request,
15 compat_xml_parse_error,
33 from .brightcove import BrightcoveIE
34 from .nbc import NBCSportsVPlayerIE
35 from .ooyala import OoyalaIE
36 from .rutv import RUTVIE
37 from .tvc import TVCIE
38 from .sportbox import SportBoxEmbedIE
39 from .smotri import SmotriIE
40 from .condenast import CondeNastIE
41 from .udn import UDNEmbedIE
42 from .senateisvp import SenateISVPIE
43 from .bliptv import BlipTVIE
44 from .svt import SVTIE
45 from .pornhub import PornHubIE
46 from .xhamster import XHamsterEmbedIE
47 from .vimeo import VimeoIE
48 from .dailymotion import DailymotionCloudIE
49 from .onionstudios import OnionStudiosIE
50 from .snagfilms import SnagFilmsEmbedIE
53 class GenericIE(InfoExtractor):
54 IE_DESC = 'Generic downloader that works on some sites'
58 # Direct link to a video
60 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
61 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
66 'upload_date': '20100513',
69 # Direct link to media delivered compressed (until Accept-Encoding is *)
71 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
72 'md5': '128c42e68b13950268b648275386fc74',
74 'id': 'FictionJunction-Parallel_Hearts',
76 'title': 'FictionJunction-Parallel_Hearts',
77 'upload_date': '20140522',
79 'expected_warnings': [
80 'URL could be a direct video link, returning it as such.'
83 # Direct download with broken HEAD
85 'url': 'http://ai-radio.org:8000/radio.opus',
92 'skip_download': True, # infinite live stream
94 'expected_warnings': [
95 r'501.*Not Implemented'
98 # Direct link with incorrect MIME type
100 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
101 'md5': '4ccbebe5f36706d85221f204d7eb5913',
103 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
104 'id': '5_Lennart_Poettering_-_Systemd',
106 'title': '5_Lennart_Poettering_-_Systemd',
107 'upload_date': '20141120',
109 'expected_warnings': [
110 'URL could be a direct video link, returning it as such.'
115 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
117 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
118 'title': 'Zero Punctuation',
119 'description': 're:.*groundbreaking video review series.*'
121 'playlist_mincount': 11,
123 # RSS feed with enclosure
125 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
127 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
129 'upload_date': '20150228',
130 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
135 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
139 'upload_date': '20130224',
140 'uploader_id': 'TheVerge',
141 'description': 're:^Chris Ziegler takes a look at the\.*',
142 'uploader': 'The Verge',
143 'title': 'First Firefox OS phones side-by-side',
146 'skip_download': False,
150 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
151 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
153 'id': '13601338388002',
155 'uploader': 'www.hodiho.fr',
156 'title': 'R\u00e9gis plante sa Jeep',
159 # bandcamp page with custom domain
161 'add_ie': ['Bandcamp'],
162 'url': 'http://bronyrock.com/track/the-pony-mash',
166 'title': 'The Pony Mash',
167 'uploader': 'M_Pallante',
169 'skip': 'There is a limit of 200 free downloads / month for the test song',
171 # embedded brightcove video
172 # it also tests brightcove videos that need to set the 'Referer' in the
175 'add_ie': ['Brightcove'],
176 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
178 'id': '2765128793001',
180 'title': 'Le cours de bourse : l’analyse technique',
181 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
182 'uploader': 'BFM BUSINESS',
185 'skip_download': True,
189 # https://github.com/rg3/youtube-dl/issues/2253
190 'url': 'http://bcove.me/i6nfkrc3',
191 'md5': '0ba9446db037002366bab3b3eb30c88c',
193 'id': '3101154703001',
195 'title': 'Still no power',
196 'uploader': 'thestar.com',
197 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
199 'add_ie': ['Brightcove'],
202 'url': 'http://www.championat.com/video/football/v/87/87499.html',
203 'md5': 'fb973ecf6e4a78a67453647444222983',
205 'id': '3414141473001',
207 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
208 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
209 'uploader': 'Championat',
213 # https://github.com/rg3/youtube-dl/issues/3541
214 'add_ie': ['Brightcove'],
215 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
217 'id': '3866516442001',
219 'title': 'Leer mij vrouwen kennen: Aflevering 1',
220 'description': 'Leer mij vrouwen kennen: Aflevering 1',
221 'uploader': 'SBS Broadcasting',
223 'skip': 'Restricted to Netherlands',
225 'skip_download': True, # m3u8 download
230 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
231 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
233 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
235 'title': '2cc213299525360.mov', # that's what we get
237 'add_ie': ['Ooyala'],
239 # multiple ooyala embeds on SBN network websites
241 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
243 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
244 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
246 'playlist_mincount': 3,
248 'skip_download': True,
250 'add_ie': ['Ooyala'],
254 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
258 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
259 'upload_date': '20140225',
260 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
261 'uploader': 'Tested',
262 'uploader_id': 'testedcom',
264 # No need to test YoutubeIE here
266 'skip_download': True,
271 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
275 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
276 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
281 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
283 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
285 'playlist_mincount': 18,
289 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
293 'title': 'Охотское море стало целиком российским',
294 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
298 'skip_download': True,
303 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/',
307 'title': 'Дошкольное воспитание',
312 'url': 'http://www.vestifinance.ru/articles/25753',
315 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
320 'title': 'Госзаказ. День 3',
326 'title': 'Госзаказ. День 2',
332 'title': 'Госзаказ. День 1',
338 'skip_download': True,
343 'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
346 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
348 'playlist_mincount': 7,
352 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
353 'md5': '65fdff94098e4a607385a60c5177c638',
357 'title': 'Hidden miracles of the natural world',
358 'uploader': 'Louie Schwartzberg',
359 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
362 # Embedded Ustream video
364 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
365 'md5': '27b99cdb639c9b12a79bca876a073417',
369 'uploader': 'AU SPA: The NSA and Privacy',
370 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
373 # nowvideo embed hidden behind percent encoding
375 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
376 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
378 'id': '06e53103ca9aa',
380 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
381 'description': 'No description',
386 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
387 'md5': '7653032cbb25bf6c80d80f217055fa43',
389 'id': '048195-004_PLUS7-F',
392 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
393 'upload_date': '20140320',
396 'skip_download': 'Requires rtmpdump'
401 'url': 'http://www.wired.com/2014/04/honda-asimo/',
402 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
404 'id': '53501be369702d3275860000',
406 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
411 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
412 'md5': '441aeeb82eb72c422c7f14ec533999cd',
414 'id': 'k2mm4bCdJ6CQ2i7c8o2',
416 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
419 'add_ie': ['Dailymotion'],
423 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
427 'title': 'The NBL Auction 2014',
428 'uploader': 'BADMINTON England',
429 'uploader_id': 'BADMINTONEvents',
430 'upload_date': '20140603',
431 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
433 'add_ie': ['Youtube'],
435 'skip_download': True,
440 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
441 'md5': '35727f82f58c76d996fc188f9755b0d5',
443 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
446 'description': 'Mario\'s life in the fast lane has never looked so good.',
449 # YouTube embed via <data-embed-url="">
451 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
455 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
456 'uploader': 'Gameloft',
457 'uploader_id': 'gameloft',
458 'upload_date': '20140828',
459 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
462 'skip_download': True,
467 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
469 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
471 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
472 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
477 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
479 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
480 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
486 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
491 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
492 'md5': '9d65602bf31c6e20014319c7d07fba27',
494 'id': '5123ea6d5e5a7',
497 'uploader': 'www.handjobhub.com',
498 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
501 # Multiple brightcove videos
502 # https://github.com/rg3/youtube-dl/issues/2283
504 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
506 'id': 'always-never',
507 'title': 'Always / Never - The New Yorker',
511 'extract_flat': False,
512 'skip_download': True,
517 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
518 'md5': '96f09a37e44da40dd083e12d9a683327',
522 'title': 'Ump changes call to ball',
523 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
525 'timestamp': 1401537900,
526 'upload_date': '20140531',
527 'thumbnail': 're:^https?://.*\.jpg$',
532 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
533 'md5': '8788b683c777a5cf25621eaf286d0c23',
537 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
539 'filesize': 182808282,
540 'uploader': 'education-portal.com',
544 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
545 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
549 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
551 'uploader': 'thoughtworks.wistia.com',
556 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
560 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
561 'uploader': 'Sophos Security',
562 'title': 'Chet Chat 171 - Oct 29, 2014',
563 'upload_date': '20141029',
568 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
572 'upload_date': '20141112',
573 'title': 'Rosetta #CometLanding webcast HL 10',
578 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
581 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
583 'playlist_mincount': 2,
587 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
591 'upload_date': '20141126',
592 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
597 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
599 'id': '730m_DandD_1901_512k',
601 'uploader': 'www.abc.net.au',
602 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
605 # embedded viddler video
607 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
611 'uploader': 'deadspin',
612 'title': 'WALL-TO-GORTAT',
613 'timestamp': 1422285291,
614 'upload_date': '20150126',
616 'add_ie': ['Viddler'],
620 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
624 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
625 'description': 'md5:601cb790edd05908957dae8aaa866465',
626 'upload_date': '20150220',
631 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
635 'upload_date': '20150212',
636 'uploader': 'The National Archives UK',
637 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
638 'uploader_id': 'NationalArchives08',
639 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
644 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
645 'playlist_mincount': 5,
647 'id': 'aanslagen-kopenhagen',
648 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
653 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
657 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
662 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
666 'upload_date': '20150226',
667 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
669 'title': 'John Carlson Postgame 2/25/15',
672 # Eagle.Platform embed (generic URL)
674 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
678 'title': 'Навальный вышел на свободу',
679 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
680 'thumbnail': 're:^https?://.*\.jpg$',
686 # ClipYou (Eagle.Platform) embed (custom URL)
688 'url': 'http://muz-tv.ru/play/7129/',
692 'title': "'O Sole Mio",
693 'thumbnail': 're:^https?://.*\.jpg$',
700 'url': 'http://muz-tv.ru/kinozal/view/7400/',
704 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
705 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
706 'thumbnail': 're:^https?://.*\.jpg$',
713 'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
717 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
718 'thumbnail': 're:^https?://.*\.png$',
724 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
725 'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
729 'title': 'Facebook Creates "On This Day" | Crunch Report',
734 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
738 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
743 # Crooks and Liars embed
745 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
749 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
750 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
751 'timestamp': 1428207000,
752 'upload_date': '20150405',
753 'uploader': 'Heather',
756 # Crooks and Liars external embed
758 'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
760 'id': 'MTE3MjUtMzQ2MzA',
762 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
763 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
764 'timestamp': 1265032391,
765 'upload_date': '20100201',
766 'uploader': 'Heather',
769 # NBC Sports vplayer embed
771 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
773 'id': 'ln7x1qSThw4k',
775 'title': "PFT Live: New leader in the 'new-look' defense",
776 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
781 'url': 'http://www.udn.com/news/story/7314/822787',
782 'md5': 'fd2060e988c326991037b9aff9df21a6',
786 'title': '中一中男師變性 全校師生力挺',
787 'thumbnail': 're:^https?://.*\.jpg$',
792 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
794 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
796 'description': 'VIDEO: Index/Match versus VLOOKUP.',
797 'title': 'This is what separates the Excel masters from the wannabes',
801 'skip_download': True,
804 # Contains a SMIL manifest
806 'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
810 'title': '+ Football: Lottery Champions League Europe',
811 'uploader': 'www.telewebion.com',
815 'skip_download': True,
818 # Brightcove URL in single quotes
820 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
821 'md5': '4ae374f1f8b91c889c4b9203c8c752af',
823 'id': '4255764656001',
825 'title': 'SN Presents: Russell Martin, World Citizen',
826 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
827 'uploader': 'Rogers Sportsnet',
830 # Dailymotion Cloud video
832 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
833 'md5': '49444254273501a64675a7e68c502681',
835 'id': '5585de919473990de4bee11b',
838 'thumbnail': 're:^https?://.*\.jpe?g$',
843 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537',
847 'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You',
848 'thumbnail': 're:^https?://.*\.jpe?g$',
849 'uploader': 'ClickHole',
850 'uploader_id': 'clickhole',
855 'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners',
856 'md5': '43662b577c018ad707a63766462b1e87',
860 'title': 'New experience with Acrobat DC',
861 'description': 'New experience with Acrobat DC',
def report_following_redirect(self, new_url):
    """Print a '[redirect]' notice naming the URL that is being followed.

    The message is written through the downloader's ``to_screen`` method;
    nothing is returned.
    """
    notice = '[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(notice)
# Handle a parsed RSS document: reads the channel title and description,
# then walks every ./channel/item element looking for a URL (the item's
# <link> text, or the "url" attribute of an <enclosure> child).
# NOTE(review): several interior lines of this method are not visible in
# this excerpt (the statements between the ones below, and the code that
# assembles the result), so the exact control flow and the shape of the
# returned value cannot be confirmed from here.
871 def _extract_rss(self, url, video_id, doc):
# .text is read directly on the result of find(); assuming doc is an
# ElementTree element, a feed without ./channel/title would make find()
# return None and raise AttributeError here -- TODO confirm.
872 playlist_title = doc.find('./channel/title').text
873 playlist_desc_el = doc.find('./channel/description')
# The description is optional: a missing element yields None, not an error.
874 playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
877 for it in doc.findall('./channel/item'):
# fatal=False: presumably a missing <link> gives a falsy value rather than
# raising (xpath_text is defined outside this excerpt -- verify).
878 next_url = xpath_text(it, 'link', fatal=False)
# <enclosure> elements carry the media URL in their "url" attribute.
# NOTE(review): presumably only consulted when <link> gave nothing; the
# guarding lines are not visible here -- confirm.
880 enclosure_nodes = it.findall('./enclosure')
881 for e in enclosure_nodes:
882 next_url = e.attrib.get('url')
# Item title, again read via .text with no visible None check.
892 'title': it.find('title').text,
# Playlist-level metadata taken from the channel element.
898 'title': playlist_title,
899 'description': playlist_desc,
# Detect a Camtasia project page: looks for a
# fo.addVariable("csConfigFile", "...") call in the page, downloads the XML
# configuration it points to, and walks the children of
# ./playlist/array/fileset to describe each media file.
# NOTE(review): several interior lines of this method are not visible in
# this excerpt (early return, list setup, result assembly); the comments
# below describe only the visible statements.
903 def _extract_camtasia(self, url, video_id, webpage):
904 """ Returns None if no camtasia video can be found. """
# default=None makes the search yield None (instead of failing) when the
# page has no csConfigFile variable.
906 camtasia_cfg = self._search_regex(
907 r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
908 webpage, 'camtasia configuration file', default=None)
909 if camtasia_cfg is None:
# The DC.title <meta> value is used as the base title for every entry.
# fatal=True: presumably raises when the tag is absent -- verify against
# _html_search_meta.
912 title = self._html_search_meta('DC.title', webpage, fatal=True)
# The configuration path may be relative, so resolve it against the page URL.
914 camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
# camtasia_cfg is rebound here: from the path string to the downloaded XML.
915 camtasia_cfg = self._download_xml(
916 camtasia_url, video_id,
917 note='Downloading camtasia configuration',
918 errnote='Failed to download camtasia configuration')
919 fileset_node = camtasia_cfg.find('./playlist/array/fileset')
# NOTE(review): no None check on fileset_node is visible before this loop.
# NOTE(review): xml.etree.ElementTree.Element.getchildren() was removed in
# Python 3.9; if _download_xml returns a standard ElementTree element,
# iterating the element directly ("for n in fileset_node:") is the
# portable equivalent -- confirm and update.
922 for n in fileset_node.getchildren():
923 url_n = n.find('./uri')
# id: last path component of the <uri> text with its extension removed.
928 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
# title: page title suffixed with the child element's tag name.
929 'title': '%s - %s' % (title, n.tag),
# The media URI is resolved relative to the page URL.
930 'url': compat_urlparse.urljoin(url, url_n.text),
# .text is read on find('./duration') with no visible None check.
931 'duration': float_or_none(n.find('./duration').text),
940 def _real_extract(self, url):
941 if url.startswith('//'):
944 'url': self.http_scheme() + url,
947 parsed_url = compat_urlparse.urlparse(url)
948 if not parsed_url.scheme:
949 default_search = self._downloader.params.get('default_search')
950 if default_search is None:
951 default_search = 'fixup_error'
953 if default_search in ('auto', 'auto_warning', 'fixup_error'):
955 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
956 return self.url_result('http://' + url)
957 elif default_search != 'fixup_error':
958 if default_search == 'auto_warning':
959 if re.match(r'^(?:url|URL)$', url):
960 raise ExtractorError(
961 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
964 self._downloader.report_warning(
965 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
966 return self.url_result('ytsearch:' + url)
968 if default_search in ('error', 'fixup_error'):
969 raise ExtractorError(
970 '%r is not a valid URL. '
971 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
972 % (url, url), expected=True)
974 if ':' not in default_search:
975 default_search += ':'
976 return self.url_result(default_search + url)
978 url, smuggled_data = unsmuggle_url(url)
980 is_intentional = smuggled_data and smuggled_data.get('to_generic')
981 if smuggled_data and 'force_videoid' in smuggled_data:
982 force_videoid = smuggled_data['force_videoid']
983 video_id = force_videoid
985 video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
987 self.to_screen('%s: Requesting header' % video_id)
989 head_req = HEADRequest(url)
990 head_response = self._request_webpage(
992 note=False, errnote='Could not send HEAD request to %s' % url,
995 if head_response is not False:
997 new_url = head_response.geturl()
999 self.report_following_redirect(new_url)
1001 new_url = smuggle_url(
1002 new_url, {'force_videoid': force_videoid})
1003 return self.url_result(new_url)
1005 full_response = None
1006 if head_response is False:
1007 request = compat_urllib_request.Request(url)
1008 request.add_header('Accept-Encoding', '*')
1009 full_response = self._request_webpage(request, video_id)
1010 head_response = full_response
1012 # Check for direct link to a video
1013 content_type = head_response.headers.get('Content-Type', '')
1014 m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
1016 upload_date = unified_strdate(
1017 head_response.headers.get('Last-Modified'))
1020 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
1023 'format_id': m.group('format_id'),
1025 'vcodec': 'none' if m.group('type') == 'audio' else None
1027 'upload_date': upload_date,
1030 if not self._downloader.params.get('test', False) and not is_intentional:
1031 force = self._downloader.params.get('force_generic_extractor', False)
1032 self._downloader.report_warning(
1033 '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
1035 if not full_response:
1036 request = compat_urllib_request.Request(url)
1037 # Some webservers serve large files compressed (e.g. gzipped flac), which makes it
1038 # impossible to fetch just a chunk of the file (and we only need the first 512 bytes
1039 # to test whether it is HTML or not). With youtube-dl's default Accept-Encoding
1040 # header this always results in downloading the whole file, which is undesirable.
1041 # Therefore, for the extraction pass we override Accept-Encoding with '*' in order
1042 # to accept raw bytes and be able to download only a chunk.
1043 # It would probably be better to solve this by checking Content-Type for
1044 # application/octet-stream after the HEAD request finishes, but it is unclear whether we can rely on this.
1045 request.add_header('Accept-Encoding', '*')
1046 full_response = self._request_webpage(request, video_id)
1048 # Maybe it's a direct link to a video?
1049 # Be careful not to download the whole thing!
1050 first_bytes = full_response.read(512)
1051 if not is_html(first_bytes):
1052 self._downloader.report_warning(
1053 'URL could be a direct video link, returning it as such.')
1054 upload_date = unified_strdate(
1055 head_response.headers.get('Last-Modified'))
1058 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
1061 'upload_date': upload_date,
1064 webpage = self._webpage_read_content(
1065 full_response, url, video_id, prefix=first_bytes)
1067 self.report_extraction(video_id)
1069 # Is it an RSS feed?
1071 doc = parse_xml(webpage)
1072 if doc.tag == 'rss':
1073 return self._extract_rss(url, video_id, doc)
1074 except compat_xml_parse_error:
1077 # Is it a Camtasia project?
1078 camtasia_res = self._extract_camtasia(url, video_id, webpage)
1079 if camtasia_res is not None:
1082 # Sometimes embedded video player is hidden behind percent encoding
1083 # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
1084 # Unescaping the whole page allows to handle those cases in a generic way
1085 webpage = compat_urllib_parse.unquote(webpage)
1087 # it's tempting to parse this further, but you would
1088 # have to take into account all the variations like
1089 # Video Title - Site Name
1090 # Site Name | Video Title
1091 # Video Title - Tagline | Site Name
1092 # and so on and so forth; it's just not practical
1093 video_title = self._html_search_regex(
1094 r'(?s)<title>(.*?)</title>', webpage, 'video title',
1097 # Try to detect age limit automatically
1098 age_limit = self._rta_search(webpage)
1099 # And then there are the jokers who advertise that they use RTA,
1100 # but actually don't.
1101 AGE_LIMIT_MARKERS = [
1102 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
1104 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
1107 # video uploader is domain name
1108 video_uploader = self._search_regex(
1109 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
1112 def _playlist_from_matches(matches, getter=None, ie=None):
1114 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1116 return self.playlist_result(
1117 urlrs, playlist_id=video_id, playlist_title=video_title)
1119 # Look for BrightCove:
1120 bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
1122 self.to_screen('Brightcove video detected.')
1125 'url': smuggle_url(bc_url, {'Referer': url}),
1126 'ie_key': 'Brightcove'
1127 } for bc_url in bc_urls]
1130 '_type': 'playlist',
1131 'title': video_title,
1136 # Look for embedded rtl.nl player
1137 matches = re.findall(
1138 r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
1141 return _playlist_from_matches(matches, ie='RtlNl')
1143 vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
1144 if vimeo_url is not None:
1145 return self.url_result(vimeo_url)
1147 # Look for embedded YouTube player
1148 matches = re.findall(r'''(?x)
1157 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1161 return _playlist_from_matches(
1162 matches, lambda m: unescapeHTML(m[1]))
1164 # Look for lazyYT YouTube embed
1165 matches = re.findall(
1166 r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1168 return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1170 # Look for embedded Dailymotion player
1171 matches = re.findall(
1172 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
1174 return _playlist_from_matches(
1175 matches, lambda m: unescapeHTML(m[1]))
1177 # Look for embedded Dailymotion playlist player (#3822)
1179 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1181 playlists = re.findall(
1182 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1184 return _playlist_from_matches(
1185 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1187 # Look for embedded Wistia player
1189 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
1191 embed_url = self._proto_relative_url(
1192 unescapeHTML(match.group('url')))
1194 '_type': 'url_transparent',
1197 'uploader': video_uploader,
1198 'title': video_title,
1202 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
1205 '_type': 'url_transparent',
1206 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1208 'uploader': video_uploader,
1209 'title': video_title,
1210 'id': match.group('id')
1213 # Look for embedded blip.tv player
1214 bliptv_url = BlipTVIE._extract_url(webpage)
1216 return self.url_result(bliptv_url, 'BlipTV')
1218 # Look for SVT player
1219 svt_url = SVTIE._extract_url(webpage)
1221 return self.url_result(svt_url, 'SVT')
1223 # Look for embedded condenast player
1224 matches = re.findall(
1225 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1229 '_type': 'playlist',
1232 'ie_key': 'CondeNast',
1234 } for ma in matches],
1235 'title': video_title,
1239 # Look for Bandcamp pages with custom domain
1240 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1241 if mobj is not None:
1242 burl = unescapeHTML(mobj.group(1))
1243 # Don't set the extractor because it can be a track url or an album
1244 return self.url_result(burl)
1246 # Look for embedded Vevo player
1248 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1249 if mobj is not None:
1250 return self.url_result(mobj.group('url'))
1252 # Look for embedded Viddler player
1254 r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1256 if mobj is not None:
1257 return self.url_result(mobj.group('url'))
1259 # Look for NYTimes player
1261 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1263 if mobj is not None:
1264 return self.url_result(mobj.group('url'))
1266 # Look for Libsyn player
1268 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1269 if mobj is not None:
1270 return self.url_result(mobj.group('url'))
1272 # Look for Ooyala videos
1273 mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1274 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1275 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1276 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
1277 if mobj is not None:
1278 return OoyalaIE._build_url_result(mobj.group('ec'))
1280 # Look for multiple Ooyala embeds on SBN network websites
1281 mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1282 if mobj is not None:
1283 embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1285 return _playlist_from_matches(
1286 embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1288 # Look for Aparat videos
1289 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1290 if mobj is not None:
1291 return self.url_result(mobj.group(1), 'Aparat')
1293 # Look for MPORA videos
1294 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1295 if mobj is not None:
1296 return self.url_result(mobj.group(1), 'Mpora')
1298 # Look for embedded NovaMov-based player
1300 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1301 (?P<url>http://(?:(?:embed|www)\.)?
1303 nowvideo\.(?:ch|sx|eu|at|ag|co)|
1304 videoweed\.(?:es|com)|
1305 movshare\.(?:net|sx|ag)|
1306 divxstage\.(?:eu|net|ch|co|at|ag))
1307 /embed\.php.+?)\1''', webpage)
1308 if mobj is not None:
1309 return self.url_result(mobj.group('url'))
1311 # Look for embedded Facebook player
1313 r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1314 if mobj is not None:
1315 return self.url_result(mobj.group('url'), 'Facebook')
1317 # Look for embedded VK player
1318 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1319 if mobj is not None:
1320 return self.url_result(mobj.group('url'), 'VK')
1322 # Look for embedded ivi player
1323 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1324 if mobj is not None:
1325 return self.url_result(mobj.group('url'), 'Ivi')
1327 # Look for embedded Huffington Post player
1329 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1330 if mobj is not None:
1331 return self.url_result(mobj.group('url'), 'HuffPost')
1334 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1335 if mobj is not None:
1336 return self.url_result(mobj.group('url'))
1337 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1338 if mobj is not None:
1339 return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1341 # Look for funnyordie embed
1342 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1344 return _playlist_from_matches(
1345 matches, getter=unescapeHTML, ie='FunnyOrDie')
1347 # Look for BBC iPlayer embed
1348 matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1350 return _playlist_from_matches(matches, ie='BBCCoUk')
1352 # Look for embedded RUTV player
1353 rutv_url = RUTVIE._extract_url(webpage)
1355 return self.url_result(rutv_url, 'RUTV')
1357 # Look for embedded TVC player
1358 tvc_url = TVCIE._extract_url(webpage)
1360 return self.url_result(tvc_url, 'TVC')
1362 # Look for embedded SportBox player
1363 sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
1365 return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
1367 # Look for embedded PornHub player
1368 pornhub_url = PornHubIE._extract_url(webpage)
1370 return self.url_result(pornhub_url, 'PornHub')
1372 # Look for embedded XHamster player
1373 xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
1375 return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
1377 # Look for embedded Tvigle player
1379 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
1380 if mobj is not None:
1381 return self.url_result(mobj.group('url'), 'Tvigle')
1383 # Look for embedded TED player
1385 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1386 if mobj is not None:
1387 return self.url_result(mobj.group('url'), 'TED')
1389 # Look for embedded Ustream videos
1391 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1392 if mobj is not None:
1393 return self.url_result(mobj.group('url'), 'Ustream')
1395 # Look for embedded arte.tv player
1397 r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1399 if mobj is not None:
1400 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1402 # Look for embedded smotri.com player
1403 smotri_url = SmotriIE._extract_url(webpage)
1405 return self.url_result(smotri_url, 'Smotri')
1407 # Look for embedded soundcloud player
1409 r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1411 if mobj is not None:
1412 url = unescapeHTML(mobj.group('url'))
1413 return self.url_result(url)
1415 # Look for embedded vulture.com player
1417 r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1419 if mobj is not None:
1420 url = unescapeHTML(mobj.group('url'))
1421 return self.url_result(url, ie='Vulture')
1423 # Look for embedded mtvservices player
1425 r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1427 if mobj is not None:
1428 url = unescapeHTML(mobj.group('url'))
1429 return self.url_result(url, ie='MTVServicesEmbedded')
1431 # Look for embedded yahoo player
1433 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1435 if mobj is not None:
1436 return self.url_result(mobj.group('url'), 'Yahoo')
1438 # Look for embedded sbs.com.au player
1442 <meta\s+property="og:video"\s+content=|
1445 (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1447 if mobj is not None:
1448 return self.url_result(mobj.group('url'), 'SBS')
1450 # Look for embedded Cinchcast player
1452 r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1454 if mobj is not None:
1455 return self.url_result(mobj.group('url'), 'Cinchcast')
1458 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1462 r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
1464 if mobj is not None:
1465 return self.url_result(mobj.group('url'), 'MLB')
1468 r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1470 if mobj is not None:
1471 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1474 r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1476 if mobj is not None:
1477 return self.url_result(mobj.group('url'), 'Livestream')
1479 # Look for Zapiks embed
1481 r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1482 if mobj is not None:
1483 return self.url_result(mobj.group('url'), 'Zapiks')
1485 # Look for Kaltura embeds
1487 r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1488 if mobj is not None:
1489 return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1491 # Look for Eagle.Platform embeds
1493 r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1494 if mobj is not None:
1495 return self.url_result(mobj.group('url'), 'EaglePlatform')
1497 # Look for ClipYou (uses Eagle.Platform) embeds
1499 r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1500 if mobj is not None:
1501 return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1503 # Look for Pladform embeds
1505 r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1506 if mobj is not None:
1507 return self.url_result(mobj.group('url'), 'Pladform')
1509 # Look for Playwire embeds
1511 r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
1512 if mobj is not None:
1513 return self.url_result(mobj.group('url'))
1515 # Look for 5min embeds
1517 r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1518 if mobj is not None:
1519 return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1521 # Look for Crooks and Liars embeds
1523 r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1524 if mobj is not None:
1525 return self.url_result(mobj.group('url'))
1527 # Look for NBC Sports VPlayer embeds
1528 nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1530 return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1532 # Look for UDN embeds
1534 r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1535 if mobj is not None:
1536 return self.url_result(
1537 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
1539 # Look for Senate ISVP iframe
1540 senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
1542 return self.url_result(senate_isvp_url, 'SenateISVP')
1544 # Look for Dailymotion Cloud videos
1545 dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
1547 return self.url_result(dmcloud_url, 'DailymotionCloud')
1549 # Look for OnionStudios embeds
1550 onionstudios_url = OnionStudiosIE._extract_url(webpage)
1551 if onionstudios_url:
1552 return self.url_result(onionstudios_url)
1554 # Look for SnagFilms embeds
1555 snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage)
1557 return self.url_result(snagfilms_url)
1559 # Look for AdobeTVVideo embeds
1561 r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
1563 if mobj is not None:
1564 return self.url_result(
1565 self._proto_relative_url(unescapeHTML(mobj.group(1))),
def check_video(vurl):
    """Tell whether vurl looks like a link to playable media.

    A URL that YoutubeIE considers suitable is accepted. Any other URL is
    accepted only when its path component contains a dot and the extension
    reported by determine_ext is not one of the excluded extensions below.
    """
    # NOTE(review): YouTube URLs are assumed to be accepted here, since
    # they are turned into 'Youtube' results later on -- confirm.
    if YoutubeIE.suitable(vurl):
        return True
    excluded_exts = ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
    path = compat_urlparse.urlparse(vurl).path
    ext = determine_ext(path)
    if '.' not in path:
        return False
    return ext not in excluded_exts
def filter_video(urls):
    """Return a list with the entries of urls that check_video accepts.

    The input order is preserved; entries for which check_video returns a
    falsy value are dropped.
    """
    return [candidate for candidate in urls if check_video(candidate)]
1578 # Start with something easy: JW Player in SWFObject
1579 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1581 # Look for gorilla-vid style embedding
1582 found = filter_video(re.findall(r'''(?sx)
1586 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1589 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1591 # Broaden the search a little bit
1592 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1594 # Broaden the findall a little bit: JWPlayer JS loader
1595 found = filter_video(re.findall(
1596 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1599 found = filter_video(re.findall(r'''(?xs)
1600 flowplayer\("[^"]+",\s*
1602 \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1603 ["']?url["']?\s*:\s*["']([^"']+)["']
1608 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1610 # Try to find twitter cards info
1611 found = filter_video(re.findall(
1612 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1614 # We look for Open Graph info:
1615 # We have to match any number of spaces between elements, some sites try to align them (e.g. statigr.am)
1616 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1617 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1618 if m_video_type is not None:
1619 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1622 found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1624 REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1626 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1627 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1630 # Look also in Refresh HTTP header
1631 refresh_header = head_response.headers.get('Refresh')
1633 found = re.search(REDIRECT_REGEX, refresh_header)
1635 new_url = compat_urlparse.urljoin(url, found.group(1))
1636 self.report_following_redirect(new_url)
1642 raise UnsupportedError(url)
1645 for video_url in found:
1646 video_url = compat_urlparse.urljoin(url, video_url)
1647 video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1649 # Sometimes, jwplayer extraction will result in a YouTube URL
1650 if YoutubeIE.suitable(video_url):
1651 entries.append(self.url_result(video_url, 'Youtube'))
1654 # here's a fun little line of code for you:
1655 video_id = os.path.splitext(video_id)[0]
1657 if determine_ext(video_url) == 'smil':
1660 'formats': self._extract_smil_formats(video_url, video_id),
1661 'uploader': video_uploader,
1662 'title': video_title,
1663 'age_limit': age_limit,
1669 'uploader': video_uploader,
1670 'title': video_title,
1671 'age_limit': age_limit,
1674 if len(entries) == 1:
1677 for num, e in enumerate(entries, start=1):
1678 # 'url' results don't have a title
1679 if e.get('title') is not None:
1680 e['title'] = '%s (%d)' % (e['title'], num)
1682 '_type': 'playlist',