youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18
  19 from .utils import *
  20
  21
  22 class InfoExtractor(object):
  23     """Information Extractor class.
  24
  25     Information extractors are the classes that, given a URL, extract
  26     information about the video (or videos) the URL refers to. This
  27     information includes the real video URL, the video title, author and
  28     others. The information is stored in a dictionary which is then
  29     passed to the FileDownloader. The FileDownloader processes this
  30     information possibly downloading the video to the file system, among
  31     other possible outcomes.
  32
  33     The dictionaries must include the following fields:
  34
  35     id:             Video identifier.
  36     url:            Final video URL.
  37     title:          Video title, unescaped.
  38     ext:            Video filename extension.
  39
  40     The following fields are optional:
  41
  42     format:         The video format, defaults to ext (used for --get-format)
  43     thumbnail:      Full URL to a video thumbnail image.
  44     description:    One-line video description.
  45     uploader:       Full name of the video uploader.
  46     upload_date:    Video upload date (YYYYMMDD).
  47     uploader_id:    Nickname or id of the video uploader.
  48     location:       Physical location of the video.
  49     player_url:     SWF Player URL (used for rtmpdump).
  50     subtitles:      The subtitle file contents.
  51     urlhandle:      [internal] The urlHandle to be used to download the file,
  52                     like returned by urllib.request.urlopen
  53
  54     The fields should all be Unicode strings.
  55
  56     Subclasses of this one should re-define the _real_initialize() and
  57     _real_extract() methods and define a _VALID_URL regexp.
  58     Probably, they should also be added to the list of extractors.
  59
  60     _real_extract() must return a *list* of information dictionaries as
  61     described above.
  62
  63     Finally, the _WORKING attribute should be set to False for broken IEs
  64     in order to warn the users and skip the tests.
  65     """
  66
  67     _ready = False
  68     _downloader = None
  69     _WORKING = True
  70
  71     def __init__(self, downloader=None):
  72         """Constructor. Receives an optional downloader."""
  73         self._ready = False
  74         self.set_downloader(downloader)
  75
  76     def suitable(self, url):
  77         """Receives a URL and returns True if suitable for this IE."""
  78         return re.match(self._VALID_URL, url) is not None
  79
  80     def working(self):
  81         """Getter method for _WORKING."""
  82         return self._WORKING
  83
  84     def initialize(self):
  85         """Initializes an instance (authentication, etc)."""
  86         if not self._ready:
  87             self._real_initialize()
  88             self._ready = True
  89
  90     def extract(self, url):
  91         """Extracts URL information and returns it in list of dicts."""
  92         self.initialize()
  93         return self._real_extract(url)
  94
  95     def set_downloader(self, downloader):
  96         """Sets the downloader for this IE."""
  97         self._downloader = downloader
  98
  99     def _real_initialize(self):
 100         """Real initialization process. Redefine in subclasses."""
 101         pass
 102
 103     def _real_extract(self, url):
 104         """Real extraction process. Redefine in subclasses."""
 105         pass
 106
 107     @property
 108     def IE_NAME(self):
 109         return type(self).__name__[:-2]
 110
 111     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 112         """ Returns the response handle """
 113         if note is None:
 114             note = u'Downloading video webpage'
 115         self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
 116         try:
 117             return compat_urllib_request.urlopen(url_or_request)
 118         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 119             if errnote is None:
 120                 errnote = u'Unable to download webpage'
 121             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 122
 123     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 124         """ Returns the data of the page as a string """
 125         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 126         webpage_bytes = urlh.read()
 127         return webpage_bytes.decode('utf-8', 'replace')
 128
 129
 130 class YoutubeIE(InfoExtractor):
 131     """Information extractor for youtube.com."""
 132
 133     _VALID_URL = r"""^
 134                      (
 135                          (?:https?://)?                                       # http(s):// (optional)
 136                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 137                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 138                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 139                          (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 140                          (?:                                                  # the various things that can precede the ID:
 141                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 142                              |(?:                                             # or the v= param in all its forms
 143                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 144                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 145                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 146                                  v=
 147                              )
 148                          )?                                                   # optional -> youtube.com/xxxx is OK
 149                      )?                                                       # all until now is optional -> you can pass the naked ID
 150                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 151                      (?(1).+)?                                                # if we found the ID, everything can follow
 152                      $"""
 153     _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 154     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 155     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 156     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 157     _NETRC_MACHINE = 'youtube'
 158     # Listed in order of quality
 159     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 160     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 161     _video_extensions = {
 162         '13': '3gp',
 163         '17': 'mp4',
 164         '18': 'mp4',
 165         '22': 'mp4',
 166         '37': 'mp4',
 167         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 168         '43': 'webm',
 169         '44': 'webm',
 170         '45': 'webm',
 171         '46': 'webm',
 172     }
 173     _video_dimensions = {
 174         '5': '240x400',
 175         '6': '???',
 176         '13': '???',
 177         '17': '144x176',
 178         '18': '360x640',
 179         '22': '720x1280',
 180         '34': '360x640',
 181         '35': '480x854',
 182         '37': '1080x1920',
 183         '38': '3072x4096',
 184         '43': '360x640',
 185         '44': '480x854',
 186         '45': '720x1280',
 187         '46': '1080x1920',
 188     }
 189     IE_NAME = u'youtube'
 190
 191     def suitable(self, url):
 192         """Receives a URL and returns True if suitable for this IE."""
 193         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 194
 195     def report_lang(self):
 196         """Report attempt to set language."""
 197         self._downloader.to_screen(u'[youtube] Setting language')
 198
 199     def report_login(self):
 200         """Report attempt to log in."""
 201         self._downloader.to_screen(u'[youtube] Logging in')
 202
 203     def report_age_confirmation(self):
 204         """Report attempt to confirm age."""
 205         self._downloader.to_screen(u'[youtube] Confirming age')
 206
 207     def report_video_webpage_download(self, video_id):
 208         """Report attempt to download video webpage."""
 209         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 210
 211     def report_video_info_webpage_download(self, video_id):
 212         """Report attempt to download video info webpage."""
 213         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 214
 215     def report_video_subtitles_download(self, video_id):
 216         """Report attempt to download video info webpage."""
 217         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 218
 219     def report_video_subtitles_request(self, video_id, lang):
 220         """Report attempt to download video info webpage."""
 221         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for lang: %s' % (video_id,lang))
 222
 223     def report_information_extraction(self, video_id):
 224         """Report attempt to extract video information."""
 225         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 226
 227     def report_unavailable_format(self, video_id, format):
 228         """Report extracted video URL."""
 229         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 230
 231     def report_rtmp_download(self):
 232         """Indicate the download will use the RTMP protocol."""
 233         self._downloader.to_screen(u'[youtube] RTMP download detected')
 234
 235     def _get_available_subtitles(self, video_id):
 236         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 237         try:
 238             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 239         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 240             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 241         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 242         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 243         if not sub_lang_list:
 244             return (u'WARNING: video has no closed captions', None)
 245         return sub_lang_list
 246
 247     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 248         self.report_video_subtitles_request(video_id, sub_lang)
 249         params = compat_urllib_parse.urlencode({
 250             'lang': sub_lang,
 251             'name': sub_name,
 252             'v': video_id,
 253             'fmt': format,
 254         })
 255         url = 'http://www.youtube.com/api/timedtext?' + params
 256         try:
 257             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 258         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 259             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 260         if not sub:
 261             return (u'WARNING: Did not fetch video subtitles', None)
 262         return (None, sub_lang, sub)
 263
 264     def _extract_subtitle(self, video_id):
 265         self.report_video_subtitles_download(video_id)
 266         sub_lang_list = self._get_available_subtitles(video_id)
 267         sub_format = self._downloader.params.get('subtitlesformat')
 268         if self._downloader.params.get('subtitleslang', False):
 269             sub_lang = self._downloader.params.get('subtitleslang')
 270         elif 'en' in sub_lang_list:
 271             sub_lang = 'en'
 272         else:
 273             sub_lang = list(sub_lang_list.keys())[0]
 274         if not sub_lang in sub_lang_list:
 275             return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None)
 276
 277         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 278         return [subtitle]
 279
 280     def _extract_all_subtitles(self, video_id):
 281         self.report_video_subtitles_download(video_id)
 282         sub_lang_list = self._get_available_subtitles(video_id)
 283         sub_format = self._downloader.params.get('subtitlesformat')
 284         subtitles = []
 285         for sub_lang in sub_lang_list:
 286             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 287             subtitles.append(subtitle)
 288         return subtitles
 289
 290     def _print_formats(self, formats):
 291         print('Available formats:')
 292         for x in formats:
 293             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 294
 295     def _real_initialize(self):
 296         if self._downloader is None:
 297             return
 298
 299         username = None
 300         password = None
 301         downloader_params = self._downloader.params
 302
 303         # Attempt to use provided username and password or .netrc data
 304         if downloader_params.get('username', None) is not None:
 305             username = downloader_params['username']
 306             password = downloader_params['password']
 307         elif downloader_params.get('usenetrc', False):
 308             try:
 309                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 310                 if info is not None:
 311                     username = info[0]
 312                     password = info[2]
 313                 else:
 314                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 315             except (IOError, netrc.NetrcParseError) as err:
 316                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
 317                 return
 318
 319         # Set language
 320         request = compat_urllib_request.Request(self._LANG_URL)
 321         try:
 322             self.report_lang()
 323             compat_urllib_request.urlopen(request).read()
 324         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 325             self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
 326             return
 327
 328         # No authentication to be performed
 329         if username is None:
 330             return
 331
 332         request = compat_urllib_request.Request(self._LOGIN_URL)
 333         try:
 334             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 335         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 336             self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
 337             return
 338
 339         galx = None
 340         dsh = None
 341         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 342         if match:
 343           galx = match.group(1)
 344
 345         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 346         if match:
 347           dsh = match.group(1)
 348
 349         # Log in
 350         login_form_strs = {
 351                 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 352                 u'Email': username,
 353                 u'GALX': galx,
 354                 u'Passwd': password,
 355                 u'PersistentCookie': u'yes',
 356                 u'_utf8': u'霱',
 357                 u'bgresponse': u'js_disabled',
 358                 u'checkConnection': u'',
 359                 u'checkedDomains': u'youtube',
 360                 u'dnConn': u'',
 361                 u'dsh': dsh,
 362                 u'pstMsg': u'0',
 363                 u'rmShown': u'1',
 364                 u'secTok': u'',
 365                 u'signIn': u'Sign in',
 366                 u'timeStmp': u'',
 367                 u'service': u'youtube',
 368                 u'uilel': u'3',
 369                 u'hl': u'en_US',
 370         }
 371         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 372         # chokes on unicode
 373         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 374         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 375         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 376         try:
 377             self.report_login()
 378             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 379             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 380                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 381                 return
 382         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 383             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
 384             return
 385
 386         # Confirm age
 387         age_form = {
 388                 'next_url':     '/',
 389                 'action_confirm':   'Confirm',
 390                 }
 391         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 392         try:
 393             self.report_age_confirmation()
 394             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 395         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 396             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 397             return
 398
 399     def _extract_id(self, url):
 400         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 401         if mobj is None:
 402             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 403             return
 404         video_id = mobj.group(2)
 405         return video_id
 406
 407     def _real_extract(self, url):
 408         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 409         mobj = re.search(self._NEXT_URL_RE, url)
 410         if mobj:
 411             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 412         video_id = self._extract_id(url)
 413
 414         # Get video webpage
 415         self.report_video_webpage_download(video_id)
 416         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 417         request = compat_urllib_request.Request(url)
 418         try:
 419             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 420         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 421             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
 422             return
 423
 424         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 425
 426         # Attempt to extract SWF player URL
 427         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 428         if mobj is not None:
 429             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 430         else:
 431             player_url = None
 432
 433         # Get video info
 434         self.report_video_info_webpage_download(video_id)
 435         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 436             video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 437                     % (video_id, el_type))
 438             request = compat_urllib_request.Request(video_info_url)
 439             try:
 440                 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
 441                 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
 442                 video_info = compat_parse_qs(video_info_webpage)
 443                 if 'token' in video_info:
 444                     break
 445             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 446                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
 447                 return
 448         if 'token' not in video_info:
 449             if 'reason' in video_info:
 450                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
 451             else:
 452                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 453             return
 454
 455         # Check for "rental" videos
 456         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 457             self._downloader.trouble(u'ERROR: "rental" videos not supported')
 458             return
 459
 460         # Start extracting information
 461         self.report_information_extraction(video_id)
 462
 463         # uploader
 464         if 'author' not in video_info:
 465             self._downloader.trouble(u'ERROR: unable to extract uploader name')
 466             return
 467         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 468
 469         # uploader_id
 470         video_uploader_id = None
 471         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 472         if mobj is not None:
 473             video_uploader_id = mobj.group(1)
 474         else:
 475             self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 476
 477         # title
 478         if 'title' not in video_info:
 479             self._downloader.trouble(u'ERROR: unable to extract video title')
 480             return
 481         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 482
 483         # thumbnail image
 484         if 'thumbnail_url' not in video_info:
 485             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 486             video_thumbnail = ''
 487         else:   # don't panic if we can't find it
 488             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 489
 490         # upload date
 491         upload_date = None
 492         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 493         if mobj is not None:
 494             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 495             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 496             for expression in format_expressions:
 497                 try:
 498                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 499                 except:
 500                     pass
 501
 502         # description
 503         video_description = get_element_by_id("eow-description", video_webpage)
 504         if video_description:
 505             video_description = clean_html(video_description)
 506         else:
 507             video_description = ''
 508
 509         # subtitles
 510         video_subtitles = None
 511
 512         if self._downloader.params.get('writesubtitles', False):
 513             video_subtitles = self._extract_subtitle(video_id)
 514             if video_subtitles:
 515                 (sub_error, sub_lang, sub) = video_subtitles[0]
 516                 if sub_error:
 517                     self._downloader.trouble(sub_error)
 518
 519         if self._downloader.params.get('allsubtitles', False):
 520             video_subtitles = self._extract_all_subtitles(video_id)
 521             for video_subtitle in video_subtitles:
 522                 (sub_error, sub_lang, sub) = video_subtitle
 523                 if sub_error:
 524                     self._downloader.trouble(sub_error)
 525
 526         if 'length_seconds' not in video_info:
 527             self._downloader.trouble(u'WARNING: unable to extract video duration')
 528             video_duration = ''
 529         else:
 530             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 531
 532         # token
 533         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 534
 535         # Decide which formats to download
 536         req_format = self._downloader.params.get('format', None)
 537
 538         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 539             self.report_rtmp_download()
 540             video_url_list = [(None, video_info['conn'][0])]
 541         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 542             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 543             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 544             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 545             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 546
 547             format_limit = self._downloader.params.get('format_limit', None)
 548             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 549             if format_limit is not None and format_limit in available_formats:
 550                 format_list = available_formats[available_formats.index(format_limit):]
 551             else:
 552                 format_list = available_formats
 553             existing_formats = [x for x in format_list if x in url_map]
 554             if len(existing_formats) == 0:
 555                 self._downloader.trouble(u'ERROR: no known formats available for video')
 556                 return
 557             if self._downloader.params.get('listformats', None):
 558                 self._print_formats(existing_formats)
 559                 return
 560             if req_format is None or req_format == 'best':
 561                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 562             elif req_format == 'worst':
 563                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 564             elif req_format in ('-1', 'all'):
 565                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 566             else:
 567                 # Specific formats. We pick the first in a slash-delimeted sequence.
 568                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 569                 req_formats = req_format.split('/')
 570                 video_url_list = None
 571                 for rf in req_formats:
 572                     if rf in url_map:
 573                         video_url_list = [(rf, url_map[rf])]
 574                         break
 575                 if video_url_list is None:
 576                     self._downloader.trouble(u'ERROR: requested format not available')
 577                     return
 578         else:
 579             self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 580             return
 581
 582         results = []
 583         for format_param, video_real_url in video_url_list:
 584             # Extension
 585             video_extension = self._video_extensions.get(format_param, 'flv')
 586
 587             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 588                                               self._video_dimensions.get(format_param, '???'))
 589
 590             results.append({
 591                 'id':       video_id,
 592                 'url':      video_real_url,
 593                 'uploader': video_uploader,
 594                 'uploader_id': video_uploader_id,
 595                 'upload_date':  upload_date,
 596                 'title':    video_title,
 597                 'ext':      video_extension,
 598                 'format':   video_format,
 599                 'thumbnail':    video_thumbnail,
 600                 'description':  video_description,
 601                 'player_url':   player_url,
 602                 'subtitles':    video_subtitles,
 603                 'duration':     video_duration
 604             })
 605         return results
 606
 607
 608 class MetacafeIE(InfoExtractor):
 609     """Information Extractor for metacafe.com."""
 610
 611     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 612     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 613     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 614     IE_NAME = u'metacafe'
 615
 616     def __init__(self, downloader=None):
 617         InfoExtractor.__init__(self, downloader)
 618
 619     def report_disclaimer(self):
 620         """Report disclaimer retrieval."""
 621         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 622
 623     def report_age_confirmation(self):
 624         """Report attempt to confirm age."""
 625         self._downloader.to_screen(u'[metacafe] Confirming age')
 626
 627     def report_download_webpage(self, video_id):
 628         """Report webpage download."""
 629         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 630
 631     def report_extraction(self, video_id):
 632         """Report information extraction."""
 633         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 634
 635     def _real_initialize(self):
 636         # Retrieve disclaimer
 637         request = compat_urllib_request.Request(self._DISCLAIMER)
 638         try:
 639             self.report_disclaimer()
 640             disclaimer = compat_urllib_request.urlopen(request).read()
 641         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 642             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
 643             return
 644
 645         # Confirm age
 646         disclaimer_form = {
 647             'filters': '0',
 648             'submit': "Continue - I'm over 18",
 649             }
 650         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 651         try:
 652             self.report_age_confirmation()
 653             disclaimer = compat_urllib_request.urlopen(request).read()
 654         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 655             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 656             return
 657
 658     def _real_extract(self, url):
 659         # Extract id and simplified title from URL
 660         mobj = re.match(self._VALID_URL, url)
 661         if mobj is None:
 662             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 663             return
 664
 665         video_id = mobj.group(1)
 666
 667         # Check if video comes from YouTube
 668         mobj2 = re.match(r'^yt-(.*)$', video_id)
 669         if mobj2 is not None:
 670             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 671             return
 672
 673         # Retrieve video webpage to extract further information
 674         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 675         try:
 676             self.report_download_webpage(video_id)
 677             webpage = compat_urllib_request.urlopen(request).read()
 678         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 679             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 680             return
 681
 682         # Extract URL, uploader and title from webpage
 683         self.report_extraction(video_id)
 684         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 685         if mobj is not None:
 686             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 687             video_extension = mediaURL[-3:]
 688
 689             # Extract gdaKey if available
 690             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 691             if mobj is None:
 692                 video_url = mediaURL
 693             else:
 694                 gdaKey = mobj.group(1)
 695                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 696         else:
 697             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 698             if mobj is None:
 699                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 700                 return
 701             vardict = compat_parse_qs(mobj.group(1))
 702             if 'mediaData' not in vardict:
 703                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 704                 return
 705             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 706             if mobj is None:
 707                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 708                 return
 709             mediaURL = mobj.group(1).replace('\\/', '/')
 710             video_extension = mediaURL[-3:]
 711             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 712
 713         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 714         if mobj is None:
 715             self._downloader.trouble(u'ERROR: unable to extract title')
 716             return
 717         video_title = mobj.group(1).decode('utf-8')
 718
 719         mobj = re.search(r'submitter=(.*?);', webpage)
 720         if mobj is None:
 721             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 722             return
 723         video_uploader = mobj.group(1)
 724
 725         return [{
 726             'id':       video_id.decode('utf-8'),
 727             'url':      video_url.decode('utf-8'),
 728             'uploader': video_uploader.decode('utf-8'),
 729             'upload_date':  None,
 730             'title':    video_title,
 731             'ext':      video_extension.decode('utf-8'),
 732         }]
 733
 734
 735 class DailymotionIE(InfoExtractor):
 736     """Information Extractor for Dailymotion"""
 737
 738     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 739     IE_NAME = u'dailymotion'
 740     _WORKING = False
 741
 742     def __init__(self, downloader=None):
 743         InfoExtractor.__init__(self, downloader)
 744
 745     def report_extraction(self, video_id):
 746         """Report information extraction."""
 747         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 748
 749     def _real_extract(self, url):
 750         # Extract id and simplified title from URL
 751         mobj = re.match(self._VALID_URL, url)
 752         if mobj is None:
 753             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 754             return
 755
 756         video_id = mobj.group(1).split('_')[0].split('?')[0]
 757
 758         video_extension = 'mp4'
 759
 760         # Retrieve video webpage to extract further information
 761         request = compat_urllib_request.Request(url)
 762         request.add_header('Cookie', 'family_filter=off')
 763         webpage = self._download_webpage(request, video_id)
 764
 765         # Extract URL, uploader and title from webpage
 766         self.report_extraction(video_id)
 767         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 768         if mobj is None:
 769             self._downloader.trouble(u'ERROR: unable to extract media URL')
 770             return
 771         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 772
 773         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 774             if key in flashvars:
 775                 max_quality = key
 776                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 777                 break
 778         else:
 779             self._downloader.trouble(u'ERROR: unable to extract video URL')
 780             return
 781
 782         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 783         if mobj is None:
 784             self._downloader.trouble(u'ERROR: unable to extract video URL')
 785             return
 786
 787         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 788
 789         # TODO: support choosing qualities
 790
 791         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 792         if mobj is None:
 793             self._downloader.trouble(u'ERROR: unable to extract title')
 794             return
 795         video_title = unescapeHTML(mobj.group('title'))
 796
 797         video_uploader = None
 798         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 799         if mobj is None:
 800             # lookin for official user
 801             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 802             if mobj_official is None:
 803                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 804             else:
 805                 video_uploader = mobj_official.group(1)
 806         else:
 807             video_uploader = mobj.group(1)
 808
 809         video_upload_date = None
 810         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 811         if mobj is not None:
 812             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 813
 814         return [{
 815             'id':       video_id,
 816             'url':      video_url,
 817             'uploader': video_uploader,
 818             'upload_date':  video_upload_date,
 819             'title':    video_title,
 820             'ext':      video_extension,
 821         }]
 822
 823
 824 class PhotobucketIE(InfoExtractor):
 825     """Information extractor for photobucket.com."""
 826
 827     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 828     IE_NAME = u'photobucket'
 829
 830     def __init__(self, downloader=None):
 831         InfoExtractor.__init__(self, downloader)
 832
 833     def report_download_webpage(self, video_id):
 834         """Report webpage download."""
 835         self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 836
 837     def report_extraction(self, video_id):
 838         """Report information extraction."""
 839         self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 840
 841     def _real_extract(self, url):
 842         # Extract id from URL
 843         mobj = re.match(self._VALID_URL, url)
 844         if mobj is None:
 845             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 846             return
 847
 848         video_id = mobj.group(1)
 849
 850         video_extension = 'flv'
 851
 852         # Retrieve video webpage to extract further information
 853         request = compat_urllib_request.Request(url)
 854         try:
 855             self.report_download_webpage(video_id)
 856             webpage = compat_urllib_request.urlopen(request).read()
 857         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 858             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 859             return
 860
 861         # Extract URL, uploader, and title from webpage
 862         self.report_extraction(video_id)
 863         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 864         if mobj is None:
 865             self._downloader.trouble(u'ERROR: unable to extract media URL')
 866             return
 867         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 868
 869         video_url = mediaURL
 870
 871         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 872         if mobj is None:
 873             self._downloader.trouble(u'ERROR: unable to extract title')
 874             return
 875         video_title = mobj.group(1).decode('utf-8')
 876
 877         video_uploader = mobj.group(2).decode('utf-8')
 878
 879         return [{
 880             'id':       video_id.decode('utf-8'),
 881             'url':      video_url.decode('utf-8'),
 882             'uploader': video_uploader,
 883             'upload_date':  None,
 884             'title':    video_title,
 885             'ext':      video_extension.decode('utf-8'),
 886         }]
 887
 888
 889 class YahooIE(InfoExtractor):
 890     """Information extractor for video.yahoo.com."""
 891
 892     _WORKING = False
 893     # _VALID_URL matches all Yahoo! Video URLs
 894     # _VPAGE_URL matches only the extractable '/watch/' URLs
 895     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 896     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 897     IE_NAME = u'video.yahoo'
 898
 899     def __init__(self, downloader=None):
 900         InfoExtractor.__init__(self, downloader)
 901
 902     def report_download_webpage(self, video_id):
 903         """Report webpage download."""
 904         self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 905
 906     def report_extraction(self, video_id):
 907         """Report information extraction."""
 908         self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 909
 910     def _real_extract(self, url, new_video=True):
 911         # Extract ID from URL
 912         mobj = re.match(self._VALID_URL, url)
 913         if mobj is None:
 914             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 915             return
 916
 917         video_id = mobj.group(2)
 918         video_extension = 'flv'
 919
 920         # Rewrite valid but non-extractable URLs as
 921         # extractable English language /watch/ URLs
 922         if re.match(self._VPAGE_URL, url) is None:
 923             request = compat_urllib_request.Request(url)
 924             try:
 925                 webpage = compat_urllib_request.urlopen(request).read()
 926             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 927                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 928                 return
 929
 930             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 931             if mobj is None:
 932                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 933                 return
 934             yahoo_id = mobj.group(1)
 935
 936             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 937             if mobj is None:
 938                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 939                 return
 940             yahoo_vid = mobj.group(1)
 941
 942             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 943             return self._real_extract(url, new_video=False)
 944
 945         # Retrieve video webpage to extract further information
 946         request = compat_urllib_request.Request(url)
 947         try:
 948             self.report_download_webpage(video_id)
 949             webpage = compat_urllib_request.urlopen(request).read()
 950         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 951             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 952             return
 953
 954         # Extract uploader and title from webpage
 955         self.report_extraction(video_id)
 956         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 957         if mobj is None:
 958             self._downloader.trouble(u'ERROR: unable to extract video title')
 959             return
 960         video_title = mobj.group(1).decode('utf-8')
 961
 962         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 963         if mobj is None:
 964             self._downloader.trouble(u'ERROR: unable to extract video uploader')
 965             return
 966         video_uploader = mobj.group(1).decode('utf-8')
 967
 968         # Extract video thumbnail
 969         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 970         if mobj is None:
 971             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 972             return
 973         video_thumbnail = mobj.group(1).decode('utf-8')
 974
 975         # Extract video description
 976         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 977         if mobj is None:
 978             self._downloader.trouble(u'ERROR: unable to extract video description')
 979             return
 980         video_description = mobj.group(1).decode('utf-8')
 981         if not video_description:
 982             video_description = 'No description available.'
 983
 984         # Extract video height and width
 985         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 986         if mobj is None:
 987             self._downloader.trouble(u'ERROR: unable to extract video height')
 988             return
 989         yv_video_height = mobj.group(1)
 990
 991         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 992         if mobj is None:
 993             self._downloader.trouble(u'ERROR: unable to extract video width')
 994             return
 995         yv_video_width = mobj.group(1)
 996
 997         # Retrieve video playlist to extract media URL
 998         # I'm not completely sure what all these options are, but we
 999         # seem to need most of them, otherwise the server sends a 401.
1000         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1001         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1002         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1003                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1004                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1005         try:
1006             self.report_download_webpage(video_id)
1007             webpage = compat_urllib_request.urlopen(request).read()
1008         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1009             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1010             return
1011
1012         # Extract media URL from playlist XML
1013         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1014         if mobj is None:
1015             self._downloader.trouble(u'ERROR: Unable to extract media URL')
1016             return
1017         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1018         video_url = unescapeHTML(video_url)
1019
1020         return [{
1021             'id':       video_id.decode('utf-8'),
1022             'url':      video_url,
1023             'uploader': video_uploader,
1024             'upload_date':  None,
1025             'title':    video_title,
1026             'ext':      video_extension.decode('utf-8'),
1027             'thumbnail':    video_thumbnail.decode('utf-8'),
1028             'description':  video_description,
1029         }]
1030
1031
1032 class VimeoIE(InfoExtractor):
1033     """Information extractor for vimeo.com."""
1034
1035     # _VALID_URL matches Vimeo URLs
1036     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1037     IE_NAME = u'vimeo'
1038
1039     def __init__(self, downloader=None):
1040         InfoExtractor.__init__(self, downloader)
1041
1042     def report_download_webpage(self, video_id):
1043         """Report webpage download."""
1044         self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1045
1046     def report_extraction(self, video_id):
1047         """Report information extraction."""
1048         self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1049
1050     def _real_extract(self, url, new_video=True):
1051         # Extract ID from URL
1052         mobj = re.match(self._VALID_URL, url)
1053         if mobj is None:
1054             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1055             return
1056
1057         video_id = mobj.group('id')
1058         if not mobj.group('proto'):
1059             url = 'https://' + url
1060         if mobj.group('direct_link'):
1061             url = 'https://vimeo.com/' + video_id
1062
1063         # Retrieve video webpage to extract further information
1064         request = compat_urllib_request.Request(url, None, std_headers)
1065         try:
1066             self.report_download_webpage(video_id)
1067             webpage_bytes = compat_urllib_request.urlopen(request).read()
1068             webpage = webpage_bytes.decode('utf-8')
1069         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1070             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1071             return
1072
1073         # Now we begin extracting as much information as we can from what we
1074         # retrieved. First we extract the information common to all extractors,
1075         # and latter we extract those that are Vimeo specific.
1076         self.report_extraction(video_id)
1077
1078         # Extract the config JSON
1079         try:
1080             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1081             config = json.loads(config)
1082         except:
1083             self._downloader.trouble(u'ERROR: unable to extract info section')
1084             return
1085
1086         # Extract title
1087         video_title = config["video"]["title"]
1088
1089         # Extract uploader and uploader_id
1090         video_uploader = config["video"]["owner"]["name"]
1091         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1092
1093         # Extract video thumbnail
1094         video_thumbnail = config["video"]["thumbnail"]
1095
1096         # Extract video description
1097         video_description = get_element_by_attribute("itemprop", "description", webpage)
1098         if video_description: video_description = clean_html(video_description)
1099         else: video_description = ''
1100
1101         # Extract upload date
1102         video_upload_date = None
1103         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1104         if mobj is not None:
1105             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1106
1107         # Vimeo specific: extract request signature and timestamp
1108         sig = config['request']['signature']
1109         timestamp = config['request']['timestamp']
1110
1111         # Vimeo specific: extract video codec and quality information
1112         # First consider quality, then codecs, then take everything
1113         # TODO bind to format param
1114         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1115         files = { 'hd': [], 'sd': [], 'other': []}
1116         for codec_name, codec_extension in codecs:
1117             if codec_name in config["video"]["files"]:
1118                 if 'hd' in config["video"]["files"][codec_name]:
1119                     files['hd'].append((codec_name, codec_extension, 'hd'))
1120                 elif 'sd' in config["video"]["files"][codec_name]:
1121                     files['sd'].append((codec_name, codec_extension, 'sd'))
1122                 else:
1123                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1124
1125         for quality in ('hd', 'sd', 'other'):
1126             if len(files[quality]) > 0:
1127                 video_quality = files[quality][0][2]
1128                 video_codec = files[quality][0][0]
1129                 video_extension = files[quality][0][1]
1130                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1131                 break
1132         else:
1133             self._downloader.trouble(u'ERROR: no known codec found')
1134             return
1135
1136         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1137                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1138
1139         return [{
1140             'id':       video_id,
1141             'url':      video_url,
1142             'uploader': video_uploader,
1143             'uploader_id': video_uploader_id,
1144             'upload_date':  video_upload_date,
1145             'title':    video_title,
1146             'ext':      video_extension,
1147             'thumbnail':    video_thumbnail,
1148             'description':  video_description,
1149         }]
1150
1151
1152 class ArteTvIE(InfoExtractor):
1153     """arte.tv information extractor."""
1154
1155     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1156     _LIVE_URL = r'index-[0-9]+\.html$'
1157
1158     IE_NAME = u'arte.tv'
1159
1160     def __init__(self, downloader=None):
1161         InfoExtractor.__init__(self, downloader)
1162
1163     def report_download_webpage(self, video_id):
1164         """Report webpage download."""
1165         self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1166
1167     def report_extraction(self, video_id):
1168         """Report information extraction."""
1169         self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1170
1171     def fetch_webpage(self, url):
1172         request = compat_urllib_request.Request(url)
1173         try:
1174             self.report_download_webpage(url)
1175             webpage = compat_urllib_request.urlopen(request).read()
1176         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1177             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1178             return
1179         except ValueError as err:
1180             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1181             return
1182         return webpage
1183
1184     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1185         page = self.fetch_webpage(url)
1186         mobj = re.search(regex, page, regexFlags)
1187         info = {}
1188
1189         if mobj is None:
1190             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1191             return
1192
1193         for (i, key, err) in matchTuples:
1194             if mobj.group(i) is None:
1195                 self._downloader.trouble(err)
1196                 return
1197             else:
1198                 info[key] = mobj.group(i)
1199
1200         return info
1201
1202     def extractLiveStream(self, url):
1203         video_lang = url.split('/')[-4]
1204         info = self.grep_webpage(
1205             url,
1206             r'src="(.*?/videothek_js.*?\.js)',
1207             0,
1208             [
1209                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1210             ]
1211         )
1212         http_host = url.split('/')[2]
1213         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1214         info = self.grep_webpage(
1215             next_url,
1216             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1217                 '(http://.*?\.swf).*?' +
1218                 '(rtmp://.*?)\'',
1219             re.DOTALL,
1220             [
1221                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1222                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1223                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1224             ]
1225         )
1226         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1227
1228     def extractPlus7Stream(self, url):
1229         video_lang = url.split('/')[-3]
1230         info = self.grep_webpage(
1231             url,
1232             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1233             0,
1234             [
1235                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1236             ]
1237         )
1238         next_url = compat_urllib_parse.unquote(info.get('url'))
1239         info = self.grep_webpage(
1240             next_url,
1241             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1242             0,
1243             [
1244                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1245             ]
1246         )
1247         next_url = compat_urllib_parse.unquote(info.get('url'))
1248
1249         info = self.grep_webpage(
1250             next_url,
1251             r'<video id="(.*?)".*?>.*?' +
1252                 '<name>(.*?)</name>.*?' +
1253                 '<dateVideo>(.*?)</dateVideo>.*?' +
1254                 '<url quality="hd">(.*?)</url>',
1255             re.DOTALL,
1256             [
1257                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1258                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1259                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1260                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1261             ]
1262         )
1263
1264         return {
1265             'id':           info.get('id'),
1266             'url':          compat_urllib_parse.unquote(info.get('url')),
1267             'uploader':     u'arte.tv',
1268             'upload_date':  info.get('date'),
1269             'title':        info.get('title').decode('utf-8'),
1270             'ext':          u'mp4',
1271             'format':       u'NA',
1272             'player_url':   None,
1273         }
1274
1275     def _real_extract(self, url):
1276         video_id = url.split('/')[-1]
1277         self.report_extraction(video_id)
1278
1279         if re.search(self._LIVE_URL, video_id) is not None:
1280             self.extractLiveStream(url)
1281             return
1282         else:
1283             info = self.extractPlus7Stream(url)
1284
1285         return [info]
1286
1287
1288 class GenericIE(InfoExtractor):
1289     """Generic last-resort information extractor."""
1290
1291     _VALID_URL = r'.*'
1292     IE_NAME = u'generic'
1293
1294     def __init__(self, downloader=None):
1295         InfoExtractor.__init__(self, downloader)
1296
1297     def report_download_webpage(self, video_id):
1298         """Report webpage download."""
1299         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1300         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1301
1302     def report_extraction(self, video_id):
1303         """Report information extraction."""
1304         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1305
1306     def report_following_redirect(self, new_url):
1307         """Report information extraction."""
1308         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1309
1310     def _test_redirect(self, url):
1311         """Check if it is a redirect, like url shorteners, in case restart chain."""
1312         class HeadRequest(compat_urllib_request.Request):
1313             def get_method(self):
1314                 return "HEAD"
1315
1316         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1317             """
1318             Subclass the HTTPRedirectHandler to make it use our
1319             HeadRequest also on the redirected URL
1320             """
1321             def redirect_request(self, req, fp, code, msg, headers, newurl):
1322                 if code in (301, 302, 303, 307):
1323                     newurl = newurl.replace(' ', '%20')
1324                     newheaders = dict((k,v) for k,v in req.headers.items()
1325                                       if k.lower() not in ("content-length", "content-type"))
1326                     return HeadRequest(newurl,
1327                                        headers=newheaders,
1328                                        origin_req_host=req.get_origin_req_host(),
1329                                        unverifiable=True)
1330                 else:
1331                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1332
1333         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1334             """
1335             Fallback to GET if HEAD is not allowed (405 HTTP error)
1336             """
1337             def http_error_405(self, req, fp, code, msg, headers):
1338                 fp.read()
1339                 fp.close()
1340
1341                 newheaders = dict((k,v) for k,v in req.headers.items()
1342                                   if k.lower() not in ("content-length", "content-type"))
1343                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1344                                                  headers=newheaders,
1345                                                  origin_req_host=req.get_origin_req_host(),
1346                                                  unverifiable=True))
1347
1348         # Build our opener
1349         opener = compat_urllib_request.OpenerDirector()
1350         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1351                         HTTPMethodFallback, HEADRedirectHandler,
1352                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1353             opener.add_handler(handler())
1354
1355         response = opener.open(HeadRequest(url))
1356         new_url = response.geturl()
1357
1358         if url == new_url:
1359             return False
1360
1361         self.report_following_redirect(new_url)
1362         self._downloader.download([new_url])
1363         return True
1364
1365     def _real_extract(self, url):
1366         if self._test_redirect(url): return
1367
1368         video_id = url.split('/')[-1]
1369         request = compat_urllib_request.Request(url)
1370         try:
1371             self.report_download_webpage(video_id)
1372             webpage = compat_urllib_request.urlopen(request).read()
1373         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1374             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1375             return
1376         except ValueError as err:
1377             # since this is the last-resort InfoExtractor, if
1378             # this error is thrown, it'll be thrown here
1379             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1380             return
1381
1382         self.report_extraction(video_id)
1383         # Start with something easy: JW Player in SWFObject
1384         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1385         if mobj is None:
1386             # Broaden the search a little bit
1387             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1388         if mobj is None:
1389             # Broaden the search a little bit: JWPlayer JS loader
1390             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1391         if mobj is None:
1392             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1393             return
1394
1395         # It's possible that one of the regexes
1396         # matched, but returned an empty group:
1397         if mobj.group(1) is None:
1398             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1399             return
1400
1401         video_url = compat_urllib_parse.unquote(mobj.group(1))
1402         video_id = os.path.basename(video_url)
1403
1404         # here's a fun little line of code for you:
1405         video_extension = os.path.splitext(video_id)[1][1:]
1406         video_id = os.path.splitext(video_id)[0]
1407
1408         # it's tempting to parse this further, but you would
1409         # have to take into account all the variations like
1410         #   Video Title - Site Name
1411         #   Site Name | Video Title
1412         #   Video Title - Tagline | Site Name
1413         # and so on and so forth; it's just not practical
1414         mobj = re.search(r'<title>(.*)</title>', webpage)
1415         if mobj is None:
1416             self._downloader.trouble(u'ERROR: unable to extract title')
1417             return
1418         video_title = mobj.group(1)
1419
1420         # video uploader is domain name
1421         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1422         if mobj is None:
1423             self._downloader.trouble(u'ERROR: unable to extract title')
1424             return
1425         video_uploader = mobj.group(1)
1426
1427         return [{
1428             'id':       video_id,
1429             'url':      video_url,
1430             'uploader': video_uploader,
1431             'upload_date':  None,
1432             'title':    video_title,
1433             'ext':      video_extension,
1434         }]
1435
1436
1437 class YoutubeSearchIE(InfoExtractor):
1438     """Information Extractor for YouTube search queries."""
1439     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1440     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1441     _max_youtube_results = 1000
1442     IE_NAME = u'youtube:search'
1443
1444     def __init__(self, downloader=None):
1445         InfoExtractor.__init__(self, downloader)
1446
1447     def report_download_page(self, query, pagenum):
1448         """Report attempt to download search page with given number."""
1449         query = query.decode(preferredencoding())
1450         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1451
1452     def _real_extract(self, query):
1453         mobj = re.match(self._VALID_URL, query)
1454         if mobj is None:
1455             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1456             return
1457
1458         prefix, query = query.split(':')
1459         prefix = prefix[8:]
1460         query = query.encode('utf-8')
1461         if prefix == '':
1462             self._download_n_results(query, 1)
1463             return
1464         elif prefix == 'all':
1465             self._download_n_results(query, self._max_youtube_results)
1466             return
1467         else:
1468             try:
1469                 n = int(prefix)
1470                 if n <= 0:
1471                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1472                     return
1473                 elif n > self._max_youtube_results:
1474                     self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1475                     n = self._max_youtube_results
1476                 self._download_n_results(query, n)
1477                 return
1478             except ValueError: # parsing prefix as integer fails
1479                 self._download_n_results(query, 1)
1480                 return
1481
1482     def _download_n_results(self, query, n):
1483         """Downloads a specified number of results for a query"""
1484
1485         video_ids = []
1486         pagenum = 0
1487         limit = n
1488
1489         while (50 * pagenum) < limit:
1490             self.report_download_page(query, pagenum+1)
1491             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1492             request = compat_urllib_request.Request(result_url)
1493             try:
1494                 data = compat_urllib_request.urlopen(request).read()
1495             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1496                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1497                 return
1498             api_response = json.loads(data)['data']
1499
1500             new_ids = list(video['id'] for video in api_response['items'])
1501             video_ids += new_ids
1502
1503             limit = min(n, api_response['totalItems'])
1504             pagenum += 1
1505
1506         if len(video_ids) > n:
1507             video_ids = video_ids[:n]
1508         for id in video_ids:
1509             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1510         return
1511
1512
1513 class GoogleSearchIE(InfoExtractor):
1514     """Information Extractor for Google Video search queries."""
1515     _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1516     _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1517     _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1518     _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1519     _max_google_results = 1000
1520     IE_NAME = u'video.google:search'
1521
1522     def __init__(self, downloader=None):
1523         InfoExtractor.__init__(self, downloader)
1524
1525     def report_download_page(self, query, pagenum):
1526         """Report attempt to download playlist page with given number."""
1527         query = query.decode(preferredencoding())
1528         self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1529
1530     def _real_extract(self, query):
1531         mobj = re.match(self._VALID_URL, query)
1532         if mobj is None:
1533             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1534             return
1535
1536         prefix, query = query.split(':')
1537         prefix = prefix[8:]
1538         query = query.encode('utf-8')
1539         if prefix == '':
1540             self._download_n_results(query, 1)
1541             return
1542         elif prefix == 'all':
1543             self._download_n_results(query, self._max_google_results)
1544             return
1545         else:
1546             try:
1547                 n = int(prefix)
1548                 if n <= 0:
1549                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1550                     return
1551                 elif n > self._max_google_results:
1552                     self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1553                     n = self._max_google_results
1554                 self._download_n_results(query, n)
1555                 return
1556             except ValueError: # parsing prefix as integer fails
1557                 self._download_n_results(query, 1)
1558                 return
1559
1560     def _download_n_results(self, query, n):
1561         """Downloads a specified number of results for a query"""
1562
1563         video_ids = []
1564         pagenum = 0
1565
1566         while True:
1567             self.report_download_page(query, pagenum)
1568             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1569             request = compat_urllib_request.Request(result_url)
1570             try:
1571                 page = compat_urllib_request.urlopen(request).read()
1572             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1573                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1574                 return
1575
1576             # Extract video identifiers
1577             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1578                 video_id = mobj.group(1)
1579                 if video_id not in video_ids:
1580                     video_ids.append(video_id)
1581                     if len(video_ids) == n:
1582                         # Specified n videos reached
1583                         for id in video_ids:
1584                             self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1585                         return
1586
1587             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1588                 for id in video_ids:
1589                     self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1590                 return
1591
1592             pagenum = pagenum + 1
1593
1594
1595 class YahooSearchIE(InfoExtractor):
1596     """Information Extractor for Yahoo! Video search queries."""
1597
1598     _WORKING = False
1599     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1600     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1601     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1602     _MORE_PAGES_INDICATOR = r'\s*Next'
1603     _max_yahoo_results = 1000
1604     IE_NAME = u'video.yahoo:search'
1605
1606     def __init__(self, downloader=None):
1607         InfoExtractor.__init__(self, downloader)
1608
1609     def report_download_page(self, query, pagenum):
1610         """Report attempt to download playlist page with given number."""
1611         query = query.decode(preferredencoding())
1612         self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1613
1614     def _real_extract(self, query):
1615         mobj = re.match(self._VALID_URL, query)
1616         if mobj is None:
1617             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1618             return
1619
1620         prefix, query = query.split(':')
1621         prefix = prefix[8:]
1622         query = query.encode('utf-8')
1623         if prefix == '':
1624             self._download_n_results(query, 1)
1625             return
1626         elif prefix == 'all':
1627             self._download_n_results(query, self._max_yahoo_results)
1628             return
1629         else:
1630             try:
1631                 n = int(prefix)
1632                 if n <= 0:
1633                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1634                     return
1635                 elif n > self._max_yahoo_results:
1636                     self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1637                     n = self._max_yahoo_results
1638                 self._download_n_results(query, n)
1639                 return
1640             except ValueError: # parsing prefix as integer fails
1641                 self._download_n_results(query, 1)
1642                 return
1643
1644     def _download_n_results(self, query, n):
1645         """Downloads a specified number of results for a query"""
1646
1647         video_ids = []
1648         already_seen = set()
1649         pagenum = 1
1650
1651         while True:
1652             self.report_download_page(query, pagenum)
1653             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1654             request = compat_urllib_request.Request(result_url)
1655             try:
1656                 page = compat_urllib_request.urlopen(request).read()
1657             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1658                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1659                 return
1660
1661             # Extract video identifiers
1662             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1663                 video_id = mobj.group(1)
1664                 if video_id not in already_seen:
1665                     video_ids.append(video_id)
1666                     already_seen.add(video_id)
1667                     if len(video_ids) == n:
1668                         # Specified n videos reached
1669                         for id in video_ids:
1670                             self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1671                         return
1672
1673             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1674                 for id in video_ids:
1675                     self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1676                 return
1677
1678             pagenum = pagenum + 1
1679
1680
1681 class YoutubePlaylistIE(InfoExtractor):
1682     """Information Extractor for YouTube playlists."""
1683
1684     _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1685     _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1686     _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1687     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1688     IE_NAME = u'youtube:playlist'
1689
1690     def __init__(self, downloader=None):
1691         InfoExtractor.__init__(self, downloader)
1692
1693     def report_download_page(self, playlist_id, pagenum):
1694         """Report attempt to download playlist page with given number."""
1695         self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1696
1697     def _real_extract(self, url):
1698         # Extract playlist id
1699         mobj = re.match(self._VALID_URL, url)
1700         if mobj is None:
1701             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1702             return
1703
1704         # Single video case
1705         if mobj.group(3) is not None:
1706             self._downloader.download([mobj.group(3)])
1707             return
1708
1709         # Download playlist pages
1710         # prefix is 'p' as default for playlists but there are other types that need extra care
1711         playlist_prefix = mobj.group(1)
1712         if playlist_prefix == 'a':
1713             playlist_access = 'artist'
1714         else:
1715             playlist_prefix = 'p'
1716             playlist_access = 'view_play_list'
1717         playlist_id = mobj.group(2)
1718         video_ids = []
1719         pagenum = 1
1720
1721         while True:
1722             self.report_download_page(playlist_id, pagenum)
1723             url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1724             request = compat_urllib_request.Request(url)
1725             try:
1726                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1727             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1728                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1729                 return
1730
1731             # Extract video identifiers
1732             ids_in_page = []
1733             for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1734                 if mobj.group(1) not in ids_in_page:
1735                     ids_in_page.append(mobj.group(1))
1736             video_ids.extend(ids_in_page)
1737
1738             if self._MORE_PAGES_INDICATOR not in page:
1739                 break
1740             pagenum = pagenum + 1
1741
1742         total = len(video_ids)
1743
1744         playliststart = self._downloader.params.get('playliststart', 1) - 1
1745         playlistend = self._downloader.params.get('playlistend', -1)
1746         if playlistend == -1:
1747             video_ids = video_ids[playliststart:]
1748         else:
1749             video_ids = video_ids[playliststart:playlistend]
1750
1751         if len(video_ids) == total:
1752             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1753         else:
1754             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1755
1756         for id in video_ids:
1757             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1758         return
1759
1760
1761 class YoutubeChannelIE(InfoExtractor):
1762     """Information Extractor for YouTube channels."""
1763
1764     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1765     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1766     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1767     IE_NAME = u'youtube:channel'
1768
1769     def report_download_page(self, channel_id, pagenum):
1770         """Report attempt to download channel page with given number."""
1771         self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1772
1773     def _real_extract(self, url):
1774         # Extract channel id
1775         mobj = re.match(self._VALID_URL, url)
1776         if mobj is None:
1777             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1778             return
1779
1780         # Download channel pages
1781         channel_id = mobj.group(1)
1782         video_ids = []
1783         pagenum = 1
1784
1785         while True:
1786             self.report_download_page(channel_id, pagenum)
1787             url = self._TEMPLATE_URL % (channel_id, pagenum)
1788             request = compat_urllib_request.Request(url)
1789             try:
1790                 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1791             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1792                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1793                 return
1794
1795             # Extract video identifiers
1796             ids_in_page = []
1797             for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1798                 if mobj.group(1) not in ids_in_page:
1799                     ids_in_page.append(mobj.group(1))
1800             video_ids.extend(ids_in_page)
1801
1802             if self._MORE_PAGES_INDICATOR not in page:
1803                 break
1804             pagenum = pagenum + 1
1805
1806         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1807
1808         for id in video_ids:
1809             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1810         return
1811
1812
1813 class YoutubeUserIE(InfoExtractor):
1814     """Information Extractor for YouTube users."""
1815
1816     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1817     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1818     _GDATA_PAGE_SIZE = 50
1819     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1820     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1821     IE_NAME = u'youtube:user'
1822
1823     def __init__(self, downloader=None):
1824         InfoExtractor.__init__(self, downloader)
1825
1826     def report_download_page(self, username, start_index):
1827         """Report attempt to download user page."""
1828         self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1829                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1830
1831     def _real_extract(self, url):
1832         # Extract username
1833         mobj = re.match(self._VALID_URL, url)
1834         if mobj is None:
1835             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1836             return
1837
1838         username = mobj.group(1)
1839
1840         # Download video ids using YouTube Data API. Result size per
1841         # query is limited (currently to 50 videos) so we need to query
1842         # page by page until there are no video ids - it means we got
1843         # all of them.
1844
1845         video_ids = []
1846         pagenum = 0
1847
1848         while True:
1849             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1850             self.report_download_page(username, start_index)
1851
1852             request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1853
1854             try:
1855                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1856             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1857                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1858                 return
1859
1860             # Extract video identifiers
1861             ids_in_page = []
1862
1863             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1864                 if mobj.group(1) not in ids_in_page:
1865                     ids_in_page.append(mobj.group(1))
1866
1867             video_ids.extend(ids_in_page)
1868
1869             # A little optimization - if current page is not
1870             # "full", ie. does not contain PAGE_SIZE video ids then
1871             # we can assume that this page is the last one - there
1872             # are no more ids on further pages - no need to query
1873             # again.
1874
1875             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1876                 break
1877
1878             pagenum += 1
1879
1880         all_ids_count = len(video_ids)
1881         playliststart = self._downloader.params.get('playliststart', 1) - 1
1882         playlistend = self._downloader.params.get('playlistend', -1)
1883
1884         if playlistend == -1:
1885             video_ids = video_ids[playliststart:]
1886         else:
1887             video_ids = video_ids[playliststart:playlistend]
1888
1889         self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1890                 (username, all_ids_count, len(video_ids)))
1891
1892         for video_id in video_ids:
1893             self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1894
1895
1896 class BlipTVUserIE(InfoExtractor):
1897     """Information Extractor for blip.tv users."""
1898
1899     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1900     _PAGE_SIZE = 12
1901     IE_NAME = u'blip.tv:user'
1902
1903     def __init__(self, downloader=None):
1904         InfoExtractor.__init__(self, downloader)
1905
1906     def report_download_page(self, username, pagenum):
1907         """Report attempt to download user page."""
1908         self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1909                 (self.IE_NAME, username, pagenum))
1910
1911     def _real_extract(self, url):
1912         # Extract username
1913         mobj = re.match(self._VALID_URL, url)
1914         if mobj is None:
1915             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1916             return
1917
1918         username = mobj.group(1)
1919
1920         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1921
1922         request = compat_urllib_request.Request(url)
1923
1924         try:
1925             page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1926             mobj = re.search(r'data-users-id="([^"]+)"', page)
1927             page_base = page_base % mobj.group(1)
1928         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1929             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1930             return
1931
1932
1933         # Download video ids using BlipTV Ajax calls. Result size per
1934         # query is limited (currently to 12 videos) so we need to query
1935         # page by page until there are no video ids - it means we got
1936         # all of them.
1937
1938         video_ids = []
1939         pagenum = 1
1940
1941         while True:
1942             self.report_download_page(username, pagenum)
1943
1944             request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1945
1946             try:
1947                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1948             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1949                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1950                 return
1951
1952             # Extract video identifiers
1953             ids_in_page = []
1954
1955             for mobj in re.finditer(r'href="/([^"]+)"', page):
1956                 if mobj.group(1) not in ids_in_page:
1957                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1958
1959             video_ids.extend(ids_in_page)
1960
1961             # A little optimization - if current page is not
1962             # "full", ie. does not contain PAGE_SIZE video ids then
1963             # we can assume that this page is the last one - there
1964             # are no more ids on further pages - no need to query
1965             # again.
1966
1967             if len(ids_in_page) < self._PAGE_SIZE:
1968                 break
1969
1970             pagenum += 1
1971
1972         all_ids_count = len(video_ids)
1973         playliststart = self._downloader.params.get('playliststart', 1) - 1
1974         playlistend = self._downloader.params.get('playlistend', -1)
1975
1976         if playlistend == -1:
1977             video_ids = video_ids[playliststart:]
1978         else:
1979             video_ids = video_ids[playliststart:playlistend]
1980
1981         self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1982                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1983
1984         for video_id in video_ids:
1985             self._downloader.download([u'http://blip.tv/'+video_id])
1986
1987
1988 class DepositFilesIE(InfoExtractor):
1989     """Information extractor for depositfiles.com"""
1990
1991     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1992
1993     def report_download_webpage(self, file_id):
1994         """Report webpage download."""
1995         self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1996
1997     def report_extraction(self, file_id):
1998         """Report information extraction."""
1999         self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2000
2001     def _real_extract(self, url):
2002         file_id = url.split('/')[-1]
2003         # Rebuild url in english locale
2004         url = 'http://depositfiles.com/en/files/' + file_id
2005
2006         # Retrieve file webpage with 'Free download' button pressed
2007         free_download_indication = { 'gateway_result' : '1' }
2008         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2009         try:
2010             self.report_download_webpage(file_id)
2011             webpage = compat_urllib_request.urlopen(request).read()
2012         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2013             self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
2014             return
2015
2016         # Search for the real file URL
2017         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2018         if (mobj is None) or (mobj.group(1) is None):
2019             # Try to figure out reason of the error.
2020             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2021             if (mobj is not None) and (mobj.group(1) is not None):
2022                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2023                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2024             else:
2025                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2026             return
2027
2028         file_url = mobj.group(1)
2029         file_extension = os.path.splitext(file_url)[1][1:]
2030
2031         # Search for file title
2032         mobj = re.search(r'<b title="(.*?)">', webpage)
2033         if mobj is None:
2034             self._downloader.trouble(u'ERROR: unable to extract title')
2035             return
2036         file_title = mobj.group(1).decode('utf-8')
2037
2038         return [{
2039             'id':       file_id.decode('utf-8'),
2040             'url':      file_url.decode('utf-8'),
2041             'uploader': None,
2042             'upload_date':  None,
2043             'title':    file_title,
2044             'ext':      file_extension.decode('utf-8'),
2045         }]
2046
2047
2048 class FacebookIE(InfoExtractor):
2049     """Information Extractor for Facebook"""
2050
2051     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2052     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2053     _NETRC_MACHINE = 'facebook'
2054     IE_NAME = u'facebook'
2055
2056     def report_login(self):
2057         """Report attempt to log in."""
2058         self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2059
2060     def _real_initialize(self):
2061         if self._downloader is None:
2062             return
2063
2064         useremail = None
2065         password = None
2066         downloader_params = self._downloader.params
2067
2068         # Attempt to use provided username and password or .netrc data
2069         if downloader_params.get('username', None) is not None:
2070             useremail = downloader_params['username']
2071             password = downloader_params['password']
2072         elif downloader_params.get('usenetrc', False):
2073             try:
2074                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2075                 if info is not None:
2076                     useremail = info[0]
2077                     password = info[2]
2078                 else:
2079                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2080             except (IOError, netrc.NetrcParseError) as err:
2081                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2082                 return
2083
2084         if useremail is None:
2085             return
2086
2087         # Log in
2088         login_form = {
2089             'email': useremail,
2090             'pass': password,
2091             'login': 'Log+In'
2092             }
2093         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2094         try:
2095             self.report_login()
2096             login_results = compat_urllib_request.urlopen(request).read()
2097             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2098                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2099                 return
2100         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2101             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2102             return
2103
2104     def _real_extract(self, url):
2105         mobj = re.match(self._VALID_URL, url)
2106         if mobj is None:
2107             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2108             return
2109         video_id = mobj.group('ID')
2110
2111         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2112         webpage = self._download_webpage(url, video_id)
2113
2114         BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2115         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2116         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2117         if not m:
2118             raise ExtractorError(u'Cannot parse data')
2119         data = dict(json.loads(m.group(1)))
2120         params_raw = compat_urllib_parse.unquote(data['params'])
2121         params = json.loads(params_raw)
2122         video_url = params['hd_src']
2123         if not video_url:
2124             video_url = params['sd_src']
2125         if not video_url:
2126             raise ExtractorError(u'Cannot find video URL')
2127         video_duration = int(params['video_duration'])
2128
2129         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2130         if not m:
2131             raise ExtractorError(u'Cannot find title in webpage')
2132         video_title = unescapeHTML(m.group(1))
2133
2134         info = {
2135             'id': video_id,
2136             'title': video_title,
2137             'url': video_url,
2138             'ext': 'mp4',
2139             'duration': video_duration,
2140             'thumbnail': params['thumbnail_src'],
2141         }
2142         return [info]
2143
2144
2145 class BlipTVIE(InfoExtractor):
2146     """Information extractor for blip.tv"""
2147
2148     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2149     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2150     IE_NAME = u'blip.tv'
2151
2152     def report_extraction(self, file_id):
2153         """Report information extraction."""
2154         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2155
2156     def report_direct_download(self, title):
2157         """Report information extraction."""
2158         self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2159
2160     def _real_extract(self, url):
2161         mobj = re.match(self._VALID_URL, url)
2162         if mobj is None:
2163             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2164             return
2165
2166         if '?' in url:
2167             cchar = '&'
2168         else:
2169             cchar = '?'
2170         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2171         request = compat_urllib_request.Request(json_url)
2172         request.add_header('User-Agent', 'iTunes/10.6.1')
2173         self.report_extraction(mobj.group(1))
2174         info = None
2175         try:
2176             urlh = compat_urllib_request.urlopen(request)
2177             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2178                 basename = url.split('/')[-1]
2179                 title,ext = os.path.splitext(basename)
2180                 title = title.decode('UTF-8')
2181                 ext = ext.replace('.', '')
2182                 self.report_direct_download(title)
2183                 info = {
2184                     'id': title,
2185                     'url': url,
2186                     'uploader': None,
2187                     'upload_date': None,
2188                     'title': title,
2189                     'ext': ext,
2190                     'urlhandle': urlh
2191                 }
2192         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2193             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2194         if info is None: # Regular URL
2195             try:
2196                 json_code_bytes = urlh.read()
2197                 json_code = json_code_bytes.decode('utf-8')
2198             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2199                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2200                 return
2201
2202             try:
2203                 json_data = json.loads(json_code)
2204                 if 'Post' in json_data:
2205                     data = json_data['Post']
2206                 else:
2207                     data = json_data
2208
2209                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2210                 video_url = data['media']['url']
2211                 umobj = re.match(self._URL_EXT, video_url)
2212                 if umobj is None:
2213                     raise ValueError('Can not determine filename extension')
2214                 ext = umobj.group(1)
2215
2216                 info = {
2217                     'id': data['item_id'],
2218                     'url': video_url,
2219                     'uploader': data['display_name'],
2220                     'upload_date': upload_date,
2221                     'title': data['title'],
2222                     'ext': ext,
2223                     'format': data['media']['mimeType'],
2224                     'thumbnail': data['thumbnailUrl'],
2225                     'description': data['description'],
2226                     'player_url': data['embedUrl'],
2227                     'user_agent': 'iTunes/10.6.1',
2228                 }
2229             except (ValueError,KeyError) as err:
2230                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2231                 return
2232
2233         return [info]
2234
2235
2236 class MyVideoIE(InfoExtractor):
2237     """Information Extractor for myvideo.de."""
2238
2239     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2240     IE_NAME = u'myvideo'
2241
2242     def __init__(self, downloader=None):
2243         InfoExtractor.__init__(self, downloader)
2244
2245     def report_extraction(self, video_id):
2246         """Report information extraction."""
2247         self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2248
2249     def _real_extract(self,url):
2250         mobj = re.match(self._VALID_URL, url)
2251         if mobj is None:
2252             self._download.trouble(u'ERROR: invalid URL: %s' % url)
2253             return
2254
2255         video_id = mobj.group(1)
2256
2257         # Get video webpage
2258         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2259         webpage = self._download_webpage(webpage_url, video_id)
2260
2261         self.report_extraction(video_id)
2262         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
2263                  webpage)
2264         if mobj is None:
2265             self._downloader.trouble(u'ERROR: unable to extract media URL')
2266             return
2267         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2268
2269         mobj = re.search('<title>([^<]+)</title>', webpage)
2270         if mobj is None:
2271             self._downloader.trouble(u'ERROR: unable to extract title')
2272             return
2273
2274         video_title = mobj.group(1)
2275
2276         return [{
2277             'id':       video_id,
2278             'url':      video_url,
2279             'uploader': None,
2280             'upload_date':  None,
2281             'title':    video_title,
2282             'ext':      u'flv',
2283         }]
2284
2285 class ComedyCentralIE(InfoExtractor):
2286     """Information extractor for The Daily Show and Colbert Report """
2287
2288     # urls can be abbreviations like :thedailyshow or :colbert
2289     # urls for episodes like:
2290     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2291     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2292     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2293     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2294                       |(https?://)?(www\.)?
2295                           (?P<showname>thedailyshow|colbertnation)\.com/
2296                          (full-episodes/(?P<episode>.*)|
2297                           (?P<clip>
2298                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2299                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2300                      $"""
2301
2302     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2303
2304     _video_extensions = {
2305         '3500': 'mp4',
2306         '2200': 'mp4',
2307         '1700': 'mp4',
2308         '1200': 'mp4',
2309         '750': 'mp4',
2310         '400': 'mp4',
2311     }
2312     _video_dimensions = {
2313         '3500': '1280x720',
2314         '2200': '960x540',
2315         '1700': '768x432',
2316         '1200': '640x360',
2317         '750': '512x288',
2318         '400': '384x216',
2319     }
2320
2321     def suitable(self, url):
2322         """Receives a URL and returns True if suitable for this IE."""
2323         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2324
2325     def report_extraction(self, episode_id):
2326         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2327
2328     def report_config_download(self, episode_id, media_id):
2329         self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2330
2331     def report_index_download(self, episode_id):
2332         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2333
2334     def _print_formats(self, formats):
2335         print('Available formats:')
2336         for x in formats:
2337             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2338
2339
2340     def _real_extract(self, url):
2341         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2342         if mobj is None:
2343             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2344             return
2345
2346         if mobj.group('shortname'):
2347             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2348                 url = u'http://www.thedailyshow.com/full-episodes/'
2349             else:
2350                 url = u'http://www.colbertnation.com/full-episodes/'
2351             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2352             assert mobj is not None
2353
2354         if mobj.group('clip'):
2355             if mobj.group('showname') == 'thedailyshow':
2356                 epTitle = mobj.group('tdstitle')
2357             else:
2358                 epTitle = mobj.group('cntitle')
2359             dlNewest = False
2360         else:
2361             dlNewest = not mobj.group('episode')
2362             if dlNewest:
2363                 epTitle = mobj.group('showname')
2364             else:
2365                 epTitle = mobj.group('episode')
2366
2367         req = compat_urllib_request.Request(url)
2368         self.report_extraction(epTitle)
2369         try:
2370             htmlHandle = compat_urllib_request.urlopen(req)
2371             html = htmlHandle.read()
2372             webpage = html.decode('utf-8')
2373         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2374             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2375             return
2376         if dlNewest:
2377             url = htmlHandle.geturl()
2378             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2379             if mobj is None:
2380                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2381                 return
2382             if mobj.group('episode') == '':
2383                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2384                 return
2385             epTitle = mobj.group('episode')
2386
2387         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2388
2389         if len(mMovieParams) == 0:
2390             # The Colbert Report embeds the information in a without
2391             # a URL prefix; so extract the alternate reference
2392             # and then add the URL prefix manually.
2393
2394             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2395             if len(altMovieParams) == 0:
2396                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2397                 return
2398             else:
2399                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2400
2401         uri = mMovieParams[0][1]
2402         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2403         self.report_index_download(epTitle)
2404         try:
2405             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2406         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2407             self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2408             return
2409
2410         results = []
2411
2412         idoc = xml.etree.ElementTree.fromstring(indexXml)
2413         itemEls = idoc.findall('.//item')
2414         for partNum,itemEl in enumerate(itemEls):
2415             mediaId = itemEl.findall('./guid')[0].text
2416             shortMediaId = mediaId.split(':')[-1]
2417             showId = mediaId.split(':')[-2].replace('.com', '')
2418             officialTitle = itemEl.findall('./title')[0].text
2419             officialDate = itemEl.findall('./pubDate')[0].text
2420
2421             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2422                         compat_urllib_parse.urlencode({'uri': mediaId}))
2423             configReq = compat_urllib_request.Request(configUrl)
2424             self.report_config_download(epTitle, shortMediaId)
2425             try:
2426                 configXml = compat_urllib_request.urlopen(configReq).read()
2427             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2428                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2429                 return
2430
2431             cdoc = xml.etree.ElementTree.fromstring(configXml)
2432             turls = []
2433             for rendition in cdoc.findall('.//rendition'):
2434                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2435                 turls.append(finfo)
2436
2437             if len(turls) == 0:
2438                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2439                 continue
2440
2441             if self._downloader.params.get('listformats', None):
2442                 self._print_formats([i[0] for i in turls])
2443                 return
2444
2445             # For now, just pick the highest bitrate
2446             format,rtmp_video_url = turls[-1]
2447
2448             # Get the format arg from the arg stream
2449             req_format = self._downloader.params.get('format', None)
2450
2451             # Select format if we can find one
2452             for f,v in turls:
2453                 if f == req_format:
2454                     format, rtmp_video_url = f, v
2455                     break
2456
2457             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2458             if not m:
2459                 raise ExtractorError(u'Cannot transform RTMP url')
2460             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2461             video_url = base + m.group('finalid')
2462
2463             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2464             info = {
2465                 'id': shortMediaId,
2466                 'url': video_url,
2467                 'uploader': showId,
2468                 'upload_date': officialDate,
2469                 'title': effTitle,
2470                 'ext': 'mp4',
2471                 'format': format,
2472                 'thumbnail': None,
2473                 'description': officialTitle,
2474             }
2475             results.append(info)
2476
2477         return results
2478
2479
2480 class EscapistIE(InfoExtractor):
2481     """Information extractor for The Escapist """
2482
2483     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2484     IE_NAME = u'escapist'
2485
2486     def report_extraction(self, showName):
2487         self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2488
2489     def report_config_download(self, showName):
2490         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2491
2492     def _real_extract(self, url):
2493         mobj = re.match(self._VALID_URL, url)
2494         if mobj is None:
2495             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2496             return
2497         showName = mobj.group('showname')
2498         videoId = mobj.group('episode')
2499
2500         self.report_extraction(showName)
2501         try:
2502             webPage = compat_urllib_request.urlopen(url)
2503             webPageBytes = webPage.read()
2504             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2505             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2506         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2507             self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2508             return
2509
2510         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2511         description = unescapeHTML(descMatch.group(1))
2512         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2513         imgUrl = unescapeHTML(imgMatch.group(1))
2514         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2515         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2516         configUrlMatch = re.search('config=(.*)$', playerUrl)
2517         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2518
2519         self.report_config_download(showName)
2520         try:
2521             configJSON = compat_urllib_request.urlopen(configUrl)
2522             m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2523             configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2524         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2525             self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2526             return
2527
2528         # Technically, it's JavaScript, not JSON
2529         configJSON = configJSON.replace("'", '"')
2530
2531         try:
2532             config = json.loads(configJSON)
2533         except (ValueError,) as err:
2534             self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2535             return
2536
2537         playlist = config['playlist']
2538         videoUrl = playlist[1]['url']
2539
2540         info = {
2541             'id': videoId,
2542             'url': videoUrl,
2543             'uploader': showName,
2544             'upload_date': None,
2545             'title': showName,
2546             'ext': 'flv',
2547             'thumbnail': imgUrl,
2548             'description': description,
2549             'player_url': playerUrl,
2550         }
2551
2552         return [info]
2553
2554 class CollegeHumorIE(InfoExtractor):
2555     """Information extractor for collegehumor.com"""
2556
2557     _WORKING = False
2558     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2559     IE_NAME = u'collegehumor'
2560
2561     def report_manifest(self, video_id):
2562         """Report information extraction."""
2563         self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2564
2565     def report_extraction(self, video_id):
2566         """Report information extraction."""
2567         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2568
2569     def _real_extract(self, url):
2570         mobj = re.match(self._VALID_URL, url)
2571         if mobj is None:
2572             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2573             return
2574         video_id = mobj.group('videoid')
2575
2576         info = {
2577             'id': video_id,
2578             'uploader': None,
2579             'upload_date': None,
2580         }
2581
2582         self.report_extraction(video_id)
2583         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2584         try:
2585             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2586         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2587             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2588             return
2589
2590         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2591         try:
2592             videoNode = mdoc.findall('./video')[0]
2593             info['description'] = videoNode.findall('./description')[0].text
2594             info['title'] = videoNode.findall('./caption')[0].text
2595             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2596             manifest_url = videoNode.findall('./file')[0].text
2597         except IndexError:
2598             self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2599             return
2600
2601         manifest_url += '?hdcore=2.10.3'
2602         self.report_manifest(video_id)
2603         try:
2604             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2605         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2606             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2607             return
2608
2609         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2610         try:
2611             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2612             node_id = media_node.attrib['url']
2613             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2614         except IndexError as err:
2615             self._downloader.trouble(u'\nERROR: Invalid manifest file')
2616             return
2617
2618         url_pr = compat_urllib_parse_urlparse(manifest_url)
2619         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2620
2621         info['url'] = url
2622         info['ext'] = 'f4f'
2623         return [info]
2624
2625
2626 class XVideosIE(InfoExtractor):
2627     """Information extractor for xvideos.com"""
2628
2629     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2630     IE_NAME = u'xvideos'
2631
2632     def report_extraction(self, video_id):
2633         """Report information extraction."""
2634         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2635
2636     def _real_extract(self, url):
2637         mobj = re.match(self._VALID_URL, url)
2638         if mobj is None:
2639             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2640             return
2641         video_id = mobj.group(1)
2642
2643         webpage = self._download_webpage(url, video_id)
2644
2645         self.report_extraction(video_id)
2646
2647
2648         # Extract video URL
2649         mobj = re.search(r'flv_url=(.+?)&', webpage)
2650         if mobj is None:
2651             self._downloader.trouble(u'ERROR: unable to extract video url')
2652             return
2653         video_url = compat_urllib_parse.unquote(mobj.group(1))
2654
2655
2656         # Extract title
2657         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2658         if mobj is None:
2659             self._downloader.trouble(u'ERROR: unable to extract video title')
2660             return
2661         video_title = mobj.group(1)
2662
2663
2664         # Extract video thumbnail
2665         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2666         if mobj is None:
2667             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2668             return
2669         video_thumbnail = mobj.group(0)
2670
2671         info = {
2672             'id': video_id,
2673             'url': video_url,
2674             'uploader': None,
2675             'upload_date': None,
2676             'title': video_title,
2677             'ext': 'flv',
2678             'thumbnail': video_thumbnail,
2679             'description': None,
2680         }
2681
2682         return [info]
2683
2684
2685 class SoundcloudIE(InfoExtractor):
2686     """Information extractor for soundcloud.com
2687        To access the media, the uid of the song and a stream token
2688        must be extracted from the page source and the script must make
2689        a request to media.soundcloud.com/crossdomain.xml. Then
2690        the media can be grabbed by requesting from an url composed
2691        of the stream token and uid
2692      """
2693
2694     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2695     IE_NAME = u'soundcloud'
2696
2697     def __init__(self, downloader=None):
2698         InfoExtractor.__init__(self, downloader)
2699
2700     def report_resolve(self, video_id):
2701         """Report information extraction."""
2702         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2703
2704     def report_extraction(self, video_id):
2705         """Report information extraction."""
2706         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2707
2708     def _real_extract(self, url):
2709         mobj = re.match(self._VALID_URL, url)
2710         if mobj is None:
2711             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2712             return
2713
2714         # extract uploader (which is in the url)
2715         uploader = mobj.group(1)
2716         # extract simple title (uploader + slug of song title)
2717         slug_title =  mobj.group(2)
2718         simple_title = uploader + u'-' + slug_title
2719
2720         self.report_resolve('%s/%s' % (uploader, slug_title))
2721
2722         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2723         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2724         request = compat_urllib_request.Request(resolv_url)
2725         try:
2726             info_json_bytes = compat_urllib_request.urlopen(request).read()
2727             info_json = info_json_bytes.decode('utf-8')
2728         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2729             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2730             return
2731
2732         info = json.loads(info_json)
2733         video_id = info['id']
2734         self.report_extraction('%s/%s' % (uploader, slug_title))
2735
2736         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2737         request = compat_urllib_request.Request(streams_url)
2738         try:
2739             stream_json_bytes = compat_urllib_request.urlopen(request).read()
2740             stream_json = stream_json_bytes.decode('utf-8')
2741         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2742             self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2743             return
2744
2745         streams = json.loads(stream_json)
2746         mediaURL = streams['http_mp3_128_url']
2747
2748         return [{
2749             'id':       info['id'],
2750             'url':      mediaURL,
2751             'uploader': info['user']['username'],
2752             'upload_date':  info['created_at'],
2753             'title':    info['title'],
2754             'ext':      u'mp3',
2755             'description': info['description'],
2756         }]
2757
2758
2759 class InfoQIE(InfoExtractor):
2760     """Information extractor for infoq.com"""
2761     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2762
2763     def report_extraction(self, video_id):
2764         """Report information extraction."""
2765         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2766
2767     def _real_extract(self, url):
2768         mobj = re.match(self._VALID_URL, url)
2769         if mobj is None:
2770             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2771             return
2772
2773         webpage = self._download_webpage(url, video_id=url)
2774         self.report_extraction(url)
2775
2776         # Extract video URL
2777         mobj = re.search(r"jsclassref='([^']*)'", webpage)
2778         if mobj is None:
2779             self._downloader.trouble(u'ERROR: unable to extract video url')
2780             return
2781         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2782         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2783
2784         # Extract title
2785         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2786         if mobj is None:
2787             self._downloader.trouble(u'ERROR: unable to extract video title')
2788             return
2789         video_title = mobj.group(1)
2790
2791         # Extract description
2792         video_description = u'No description available.'
2793         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2794         if mobj is not None:
2795             video_description = mobj.group(1)
2796
2797         video_filename = video_url.split('/')[-1]
2798         video_id, extension = video_filename.split('.')
2799
2800         info = {
2801             'id': video_id,
2802             'url': video_url,
2803             'uploader': None,
2804             'upload_date': None,
2805             'title': video_title,
2806             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2807             'thumbnail': None,
2808             'description': video_description,
2809         }
2810
2811         return [info]
2812
2813 class MixcloudIE(InfoExtractor):
2814     """Information extractor for www.mixcloud.com"""
2815
2816     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2817     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2818     IE_NAME = u'mixcloud'
2819
2820     def __init__(self, downloader=None):
2821         InfoExtractor.__init__(self, downloader)
2822
2823     def report_download_json(self, file_id):
2824         """Report JSON download."""
2825         self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2826
2827     def report_extraction(self, file_id):
2828         """Report information extraction."""
2829         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2830
2831     def get_urls(self, jsonData, fmt, bitrate='best'):
2832         """Get urls from 'audio_formats' section in json"""
2833         file_url = None
2834         try:
2835             bitrate_list = jsonData[fmt]
2836             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2837                 bitrate = max(bitrate_list) # select highest
2838
2839             url_list = jsonData[fmt][bitrate]
2840         except TypeError: # we have no bitrate info.
2841             url_list = jsonData[fmt]
2842         return url_list
2843
2844     def check_urls(self, url_list):
2845         """Returns 1st active url from list"""
2846         for url in url_list:
2847             try:
2848                 compat_urllib_request.urlopen(url)
2849                 return url
2850             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2851                 url = None
2852
2853         return None
2854
2855     def _print_formats(self, formats):
2856         print('Available formats:')
2857         for fmt in formats.keys():
2858             for b in formats[fmt]:
2859                 try:
2860                     ext = formats[fmt][b][0]
2861                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2862                 except TypeError: # we have no bitrate info
2863                     ext = formats[fmt][0]
2864                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2865                     break
2866
2867     def _real_extract(self, url):
2868         mobj = re.match(self._VALID_URL, url)
2869         if mobj is None:
2870             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2871             return
2872         # extract uploader & filename from url
2873         uploader = mobj.group(1).decode('utf-8')
2874         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2875
2876         # construct API request
2877         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2878         # retrieve .json file with links to files
2879         request = compat_urllib_request.Request(file_url)
2880         try:
2881             self.report_download_json(file_url)
2882             jsonData = compat_urllib_request.urlopen(request).read()
2883         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2884             self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2885             return
2886
2887         # parse JSON
2888         json_data = json.loads(jsonData)
2889         player_url = json_data['player_swf_url']
2890         formats = dict(json_data['audio_formats'])
2891
2892         req_format = self._downloader.params.get('format', None)
2893         bitrate = None
2894
2895         if self._downloader.params.get('listformats', None):
2896             self._print_formats(formats)
2897             return
2898
2899         if req_format is None or req_format == 'best':
2900             for format_param in formats.keys():
2901                 url_list = self.get_urls(formats, format_param)
2902                 # check urls
2903                 file_url = self.check_urls(url_list)
2904                 if file_url is not None:
2905                     break # got it!
2906         else:
2907             if req_format not in formats:
2908                 self._downloader.trouble(u'ERROR: format is not available')
2909                 return
2910
2911             url_list = self.get_urls(formats, req_format)
2912             file_url = self.check_urls(url_list)
2913             format_param = req_format
2914
2915         return [{
2916             'id': file_id.decode('utf-8'),
2917             'url': file_url.decode('utf-8'),
2918             'uploader': uploader.decode('utf-8'),
2919             'upload_date': None,
2920             'title': json_data['name'],
2921             'ext': file_url.split('.')[-1].decode('utf-8'),
2922             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2923             'thumbnail': json_data['thumbnail_url'],
2924             'description': json_data['description'],
2925             'player_url': player_url.decode('utf-8'),
2926         }]
2927
2928 class StanfordOpenClassroomIE(InfoExtractor):
2929     """Information extractor for Stanford's Open ClassRoom"""
2930
2931     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2932     IE_NAME = u'stanfordoc'
2933
2934     def report_download_webpage(self, objid):
2935         """Report information extraction."""
2936         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2937
2938     def report_extraction(self, video_id):
2939         """Report information extraction."""
2940         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2941
2942     def _real_extract(self, url):
2943         mobj = re.match(self._VALID_URL, url)
2944         if mobj is None:
2945             raise ExtractorError(u'Invalid URL: %s' % url)
2946
2947         if mobj.group('course') and mobj.group('video'): # A specific video
2948             course = mobj.group('course')
2949             video = mobj.group('video')
2950             info = {
2951                 'id': course + '_' + video,
2952                 'uploader': None,
2953                 'upload_date': None,
2954             }
2955
2956             self.report_extraction(info['id'])
2957             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2958             xmlUrl = baseUrl + video + '.xml'
2959             try:
2960                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2961             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2962                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2963                 return
2964             mdoc = xml.etree.ElementTree.fromstring(metaXml)
2965             try:
2966                 info['title'] = mdoc.findall('./title')[0].text
2967                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2968             except IndexError:
2969                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2970                 return
2971             info['ext'] = info['url'].rpartition('.')[2]
2972             return [info]
2973         elif mobj.group('course'): # A course page
2974             course = mobj.group('course')
2975             info = {
2976                 'id': course,
2977                 'type': 'playlist',
2978                 'uploader': None,
2979                 'upload_date': None,
2980             }
2981
2982             coursepage = self._download_webpage(url, info['id'],
2983                                         note='Downloading course info page',
2984                                         errnote='Unable to download course info page')
2985
2986             m = re.search('<h1>([^<]+)</h1>', coursepage)
2987             if m:
2988                 info['title'] = unescapeHTML(m.group(1))
2989             else:
2990                 info['title'] = info['id']
2991
2992             m = re.search('<description>([^<]+)</description>', coursepage)
2993             if m:
2994                 info['description'] = unescapeHTML(m.group(1))
2995
2996             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2997             info['list'] = [
2998                 {
2999                     'type': 'reference',
3000                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3001                 }
3002                     for vpage in links]
3003             results = []
3004             for entry in info['list']:
3005                 assert entry['type'] == 'reference'
3006                 results += self.extract(entry['url'])
3007             return results
3008         else: # Root page
3009             info = {
3010                 'id': 'Stanford OpenClassroom',
3011                 'type': 'playlist',
3012                 'uploader': None,
3013                 'upload_date': None,
3014             }
3015
3016             self.report_download_webpage(info['id'])
3017             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3018             try:
3019                 rootpage = compat_urllib_request.urlopen(rootURL).read()
3020             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3021                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3022                 return
3023
3024             info['title'] = info['id']
3025
3026             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3027             info['list'] = [
3028                 {
3029                     'type': 'reference',
3030                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3031                 }
3032                     for cpage in links]
3033
3034             results = []
3035             for entry in info['list']:
3036                 assert entry['type'] == 'reference'
3037                 results += self.extract(entry['url'])
3038             return results
3039
3040 class MTVIE(InfoExtractor):
3041     """Information extractor for MTV.com"""
3042
3043     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3044     IE_NAME = u'mtv'
3045
3046     def report_extraction(self, video_id):
3047         """Report information extraction."""
3048         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3049
3050     def _real_extract(self, url):
3051         mobj = re.match(self._VALID_URL, url)
3052         if mobj is None:
3053             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3054             return
3055         if not mobj.group('proto'):
3056             url = 'http://' + url
3057         video_id = mobj.group('videoid')
3058
3059         webpage = self._download_webpage(url, video_id)
3060
3061         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3062         if mobj is None:
3063             self._downloader.trouble(u'ERROR: unable to extract song name')
3064             return
3065         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3066         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3067         if mobj is None:
3068             self._downloader.trouble(u'ERROR: unable to extract performer')
3069             return
3070         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3071         video_title = performer + ' - ' + song_name
3072
3073         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3074         if mobj is None:
3075             self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3076             return
3077         mtvn_uri = mobj.group(1)
3078
3079         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3080         if mobj is None:
3081             self._downloader.trouble(u'ERROR: unable to extract content id')
3082             return
3083         content_id = mobj.group(1)
3084
3085         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3086         self.report_extraction(video_id)
3087         request = compat_urllib_request.Request(videogen_url)
3088         try:
3089             metadataXml = compat_urllib_request.urlopen(request).read()
3090         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3091             self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3092             return
3093
3094         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3095         renditions = mdoc.findall('.//rendition')
3096
3097         # For now, always pick the highest quality.
3098         rendition = renditions[-1]
3099
3100         try:
3101             _,_,ext = rendition.attrib['type'].partition('/')
3102             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3103             video_url = rendition.find('./src').text
3104         except KeyError:
3105             self._downloader.trouble('Invalid rendition field.')
3106             return
3107
3108         info = {
3109             'id': video_id,
3110             'url': video_url,
3111             'uploader': performer,
3112             'upload_date': None,
3113             'title': video_title,
3114             'ext': ext,
3115             'format': format,
3116         }
3117
3118         return [info]
3119
3120
3121 class YoukuIE(InfoExtractor):
3122     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3123
3124     def report_download_webpage(self, file_id):
3125         """Report webpage download."""
3126         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3127
3128     def report_extraction(self, file_id):
3129         """Report information extraction."""
3130         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3131
3132     def _gen_sid(self):
3133         nowTime = int(time.time() * 1000)
3134         random1 = random.randint(1000,1998)
3135         random2 = random.randint(1000,9999)
3136
3137         return "%d%d%d" %(nowTime,random1,random2)
3138
3139     def _get_file_ID_mix_string(self, seed):
3140         mixed = []
3141         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3142         seed = float(seed)
3143         for i in range(len(source)):
3144             seed  =  (seed * 211 + 30031 ) % 65536
3145             index  =  math.floor(seed / 65536 * len(source) )
3146             mixed.append(source[int(index)])
3147             source.remove(source[int(index)])
3148         #return ''.join(mixed)
3149         return mixed
3150
3151     def _get_file_id(self, fileId, seed):
3152         mixed = self._get_file_ID_mix_string(seed)
3153         ids = fileId.split('*')
3154         realId = []
3155         for ch in ids:
3156             if ch:
3157                 realId.append(mixed[int(ch)])
3158         return ''.join(realId)
3159
3160     def _real_extract(self, url):
3161         mobj = re.match(self._VALID_URL, url)
3162         if mobj is None:
3163             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3164             return
3165         video_id = mobj.group('ID')
3166
3167         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3168
3169         request = compat_urllib_request.Request(info_url, None, std_headers)
3170         try:
3171             self.report_download_webpage(video_id)
3172             jsondata = compat_urllib_request.urlopen(request).read()
3173         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3174             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3175             return
3176
3177         self.report_extraction(video_id)
3178         try:
3179             jsonstr = jsondata.decode('utf-8')
3180             config = json.loads(jsonstr)
3181
3182             video_title =  config['data'][0]['title']
3183             seed = config['data'][0]['seed']
3184
3185             format = self._downloader.params.get('format', None)
3186             supported_format = list(config['data'][0]['streamfileids'].keys())
3187
3188             if format is None or format == 'best':
3189                 if 'hd2' in supported_format:
3190                     format = 'hd2'
3191                 else:
3192                     format = 'flv'
3193                 ext = u'flv'
3194             elif format == 'worst':
3195                 format = 'mp4'
3196                 ext = u'mp4'
3197             else:
3198                 format = 'flv'
3199                 ext = u'flv'
3200
3201
3202             fileid = config['data'][0]['streamfileids'][format]
3203             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3204         except (UnicodeDecodeError, ValueError, KeyError):
3205             self._downloader.trouble(u'ERROR: unable to extract info section')
3206             return
3207
3208         files_info=[]
3209         sid = self._gen_sid()
3210         fileid = self._get_file_id(fileid, seed)
3211
3212         #column 8,9 of fileid represent the segment number
3213         #fileid[7:9] should be changed
3214         for index, key in enumerate(keys):
3215
3216             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3217             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3218
3219             info = {
3220                 'id': '%s_part%02d' % (video_id, index),
3221                 'url': download_url,
3222                 'uploader': None,
3223                 'upload_date': None,
3224                 'title': video_title,
3225                 'ext': ext,
3226             }
3227             files_info.append(info)
3228
3229         return files_info
3230
3231
3232 class XNXXIE(InfoExtractor):
3233     """Information extractor for xnxx.com"""
3234
3235     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3236     IE_NAME = u'xnxx'
3237     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3238     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3239     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3240
3241     def report_webpage(self, video_id):
3242         """Report information extraction"""
3243         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3244
3245     def report_extraction(self, video_id):
3246         """Report information extraction"""
3247         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3248
3249     def _real_extract(self, url):
3250         mobj = re.match(self._VALID_URL, url)
3251         if mobj is None:
3252             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3253             return
3254         video_id = mobj.group(1)
3255
3256         self.report_webpage(video_id)
3257
3258         # Get webpage content
3259         try:
3260             webpage_bytes = compat_urllib_request.urlopen(url).read()
3261             webpage = webpage_bytes.decode('utf-8')
3262         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3263             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3264             return
3265
3266         result = re.search(self.VIDEO_URL_RE, webpage)
3267         if result is None:
3268             self._downloader.trouble(u'ERROR: unable to extract video url')
3269             return
3270         video_url = compat_urllib_parse.unquote(result.group(1))
3271
3272         result = re.search(self.VIDEO_TITLE_RE, webpage)
3273         if result is None:
3274             self._downloader.trouble(u'ERROR: unable to extract video title')
3275             return
3276         video_title = result.group(1)
3277
3278         result = re.search(self.VIDEO_THUMB_RE, webpage)
3279         if result is None:
3280             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3281             return
3282         video_thumbnail = result.group(1)
3283
3284         return [{
3285             'id': video_id,
3286             'url': video_url,
3287             'uploader': None,
3288             'upload_date': None,
3289             'title': video_title,
3290             'ext': 'flv',
3291             'thumbnail': video_thumbnail,
3292             'description': None,
3293         }]
3294
3295
3296 class GooglePlusIE(InfoExtractor):
3297     """Information extractor for plus.google.com."""
3298
3299     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3300     IE_NAME = u'plus.google'
3301
3302     def __init__(self, downloader=None):
3303         InfoExtractor.__init__(self, downloader)
3304
3305     def report_extract_entry(self, url):
3306         """Report downloading extry"""
3307         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3308
3309     def report_date(self, upload_date):
3310         """Report downloading extry"""
3311         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3312
3313     def report_uploader(self, uploader):
3314         """Report downloading extry"""
3315         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3316
3317     def report_title(self, video_title):
3318         """Report downloading extry"""
3319         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3320
3321     def report_extract_vid_page(self, video_page):
3322         """Report information extraction."""
3323         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3324
3325     def _real_extract(self, url):
3326         # Extract id from URL
3327         mobj = re.match(self._VALID_URL, url)
3328         if mobj is None:
3329             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3330             return
3331
3332         post_url = mobj.group(0)
3333         video_id = mobj.group(1)
3334
3335         video_extension = 'flv'
3336
3337         # Step 1, Retrieve post webpage to extract further information
3338         self.report_extract_entry(post_url)
3339         request = compat_urllib_request.Request(post_url)
3340         try:
3341             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3342         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3343             self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3344             return
3345
3346         # Extract update date
3347         upload_date = None
3348         pattern = 'title="Timestamp">(.*?)</a>'
3349         mobj = re.search(pattern, webpage)
3350         if mobj:
3351             upload_date = mobj.group(1)
3352             # Convert timestring to a format suitable for filename
3353             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3354             upload_date = upload_date.strftime('%Y%m%d')
3355         self.report_date(upload_date)
3356
3357         # Extract uploader
3358         uploader = None
3359         pattern = r'rel\="author".*?>(.*?)</a>'
3360         mobj = re.search(pattern, webpage)
3361         if mobj:
3362             uploader = mobj.group(1)
3363         self.report_uploader(uploader)
3364
3365         # Extract title
3366         # Get the first line for title
3367         video_title = u'NA'
3368         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3369         mobj = re.search(pattern, webpage)
3370         if mobj:
3371             video_title = mobj.group(1)
3372         self.report_title(video_title)
3373
3374         # Step 2, Stimulate clicking the image box to launch video
3375         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3376         mobj = re.search(pattern, webpage)
3377         if mobj is None:
3378             self._downloader.trouble(u'ERROR: unable to extract video page URL')
3379
3380         video_page = mobj.group(1)
3381         request = compat_urllib_request.Request(video_page)
3382         try:
3383             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3384         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3385             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3386             return
3387         self.report_extract_vid_page(video_page)
3388
3389
3390         # Extract video links on video page
3391         """Extract video links of all sizes"""
3392         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3393         mobj = re.findall(pattern, webpage)
3394         if len(mobj) == 0:
3395             self._downloader.trouble(u'ERROR: unable to extract video links')
3396
3397         # Sort in resolution
3398         links = sorted(mobj)
3399
3400         # Choose the lowest of the sort, i.e. highest resolution
3401         video_url = links[-1]
3402         # Only get the url. The resolution part in the tuple has no use anymore
3403         video_url = video_url[-1]
3404         # Treat escaped \u0026 style hex
3405         try:
3406             video_url = video_url.decode("unicode_escape")
3407         except AttributeError: # Python 3
3408             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3409
3410
3411         return [{
3412             'id':       video_id,
3413             'url':      video_url,
3414             'uploader': uploader,
3415             'upload_date':  upload_date,
3416             'title':    video_title,
3417             'ext':      video_extension,
3418         }]
3419
3420 class NBAIE(InfoExtractor):
3421     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3422     IE_NAME = u'nba'
3423
3424     def _real_extract(self, url):
3425         mobj = re.match(self._VALID_URL, url)
3426         if mobj is None:
3427             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3428             return
3429
3430         video_id = mobj.group(1)
3431         if video_id.endswith('/index.html'):
3432             video_id = video_id[:-len('/index.html')]
3433
3434         webpage = self._download_webpage(url, video_id)
3435
3436         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3437         def _findProp(rexp, default=None):
3438             m = re.search(rexp, webpage)
3439             if m:
3440                 return unescapeHTML(m.group(1))
3441             else:
3442                 return default
3443
3444         shortened_video_id = video_id.rpartition('/')[2]
3445         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3446         info = {
3447             'id': shortened_video_id,
3448             'url': video_url,
3449             'ext': 'mp4',
3450             'title': title,
3451             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3452             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3453         }
3454         return [info]
3455
3456 class JustinTVIE(InfoExtractor):
3457     """Information extractor for justin.tv and twitch.tv"""
3458     # TODO: One broadcast may be split into multiple videos. The key
3459     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3460     # starts at 1 and increases. Can we treat all parts as one video?
3461
3462     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3463         ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3464     _JUSTIN_PAGE_LIMIT = 100
3465     IE_NAME = u'justin.tv'
3466
3467     def report_extraction(self, file_id):
3468         """Report information extraction."""
3469         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3470
3471     def report_download_page(self, channel, offset):
3472         """Report attempt to download a single page of videos."""
3473         self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3474                 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3475
3476     # Return count of items, list of *valid* items
3477     def _parse_page(self, url):
3478         try:
3479             urlh = compat_urllib_request.urlopen(url)
3480             webpage_bytes = urlh.read()
3481             webpage = webpage_bytes.decode('utf-8', 'ignore')
3482         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3483             self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3484             return
3485
3486         response = json.loads(webpage)
3487         if type(response) != list:
3488             error_text = response.get('error', 'unknown error')
3489             self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3490             return
3491         info = []
3492         for clip in response:
3493             video_url = clip['video_file_url']
3494             if video_url:
3495                 video_extension = os.path.splitext(video_url)[1][1:]
3496                 video_date = re.sub('-', '', clip['start_time'][:10])
3497                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3498                 video_id = clip['id']
3499                 video_title = clip.get('title', video_id)
3500                 info.append({
3501                     'id': video_id,
3502                     'url': video_url,
3503                     'title': video_title,
3504                     'uploader': clip.get('channel_name', video_uploader_id),
3505                     'uploader_id': video_uploader_id,
3506                     'upload_date': video_date,
3507                     'ext': video_extension,
3508                 })
3509         return (len(response), info)
3510
3511     def _real_extract(self, url):
3512         mobj = re.match(self._VALID_URL, url)
3513         if mobj is None:
3514             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3515             return
3516
3517         api = 'http://api.justin.tv'
3518         video_id = mobj.group(mobj.lastindex)
3519         paged = False
3520         if mobj.lastindex == 1:
3521             paged = True
3522             api += '/channel/archives/%s.json'
3523         else:
3524             api += '/broadcast/by_archive/%s.json'
3525         api = api % (video_id,)
3526
3527         self.report_extraction(video_id)
3528
3529         info = []
3530         offset = 0
3531         limit = self._JUSTIN_PAGE_LIMIT
3532         while True:
3533             if paged:
3534                 self.report_download_page(video_id, offset)
3535             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3536             page_count, page_info = self._parse_page(page_url)
3537             info.extend(page_info)
3538             if not paged or page_count != limit:
3539                 break
3540             offset += limit
3541         return info
3542
3543 class FunnyOrDieIE(InfoExtractor):
3544     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3545
3546     def _real_extract(self, url):
3547         mobj = re.match(self._VALID_URL, url)
3548         if mobj is None:
3549             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3550             return
3551
3552         video_id = mobj.group('id')
3553         webpage = self._download_webpage(url, video_id)
3554
3555         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3556         if not m:
3557             self._downloader.trouble(u'ERROR: unable to find video information')
3558         video_url = unescapeHTML(m.group('url'))
3559
3560         m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3561         if not m:
3562             self._downloader.trouble(u'Cannot find video title')
3563         title = unescapeHTML(m.group('title'))
3564
3565         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3566         if m:
3567             desc = unescapeHTML(m.group('desc'))
3568         else:
3569             desc = None
3570
3571         info = {
3572             'id': video_id,
3573             'url': video_url,
3574             'ext': 'mp4',
3575             'title': title,
3576             'description': desc,
3577         }
3578         return [info]
3579
3580 class TweetReelIE(InfoExtractor):
3581     _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3582
3583     def _real_extract(self, url):
3584         mobj = re.match(self._VALID_URL, url)
3585         if mobj is None:
3586             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3587             return
3588
3589         video_id = mobj.group('id')
3590         webpage = self._download_webpage(url, video_id)
3591
3592         m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3593         if not m:
3594             self._downloader.trouble(u'ERROR: Cannot find status ID')
3595         status_id = m.group(1)
3596
3597         m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3598         if not m:
3599             self._downloader.trouble(u'WARNING: Cannot find description')
3600         desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3601
3602         m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3603         if not m:
3604             self._downloader.trouble(u'ERROR: Cannot find uploader')
3605         uploader = unescapeHTML(m.group('uploader'))
3606         uploader_id = unescapeHTML(m.group('uploader_id'))
3607
3608         m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3609         if not m:
3610             self._downloader.trouble(u'ERROR: Cannot find upload date')
3611         upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3612
3613         title = desc
3614         video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3615
3616         info = {
3617             'id': video_id,
3618             'url': video_url,
3619             'ext': 'mov',
3620             'title': title,
3621             'description': desc,
3622             'uploader': uploader,
3623             'uploader_id': uploader_id,
3624             'internal_id': status_id,
3625             'upload_date': upload_date
3626         }
3627         return [info]
3628
3629 class SteamIE(InfoExtractor):
3630     _VALID_URL = r"""http://store.steampowered.com/
3631                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3632                 (?P<gameID>\d+)/?
3633                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3634                 """
3635
3636     def suitable(self, url):
3637         """Receives a URL and returns True if suitable for this IE."""
3638         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3639
3640     def _real_extract(self, url):
3641         m = re.match(self._VALID_URL, url, re.VERBOSE)
3642         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3643         gameID = m.group('gameID')
3644         videourl = 'http://store.steampowered.com/video/%s/' % gameID
3645         webpage = self._download_webpage(videourl, gameID)
3646         mweb = re.finditer(urlRE, webpage)
3647         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3648         titles = re.finditer(namesRE, webpage)
3649         videos = []
3650         for vid,vtitle in zip(mweb,titles):
3651             video_id = vid.group('videoID')
3652             title = vtitle.group('videoName')
3653             video_url = vid.group('videoURL')
3654             if not video_url:
3655                 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3656             info = {
3657                 'id':video_id,
3658                 'url':video_url,
3659                 'ext': 'flv',
3660                 'title': unescapeHTML(title)
3661                   }
3662             videos.append(info)
3663         return videos
3664
3665 class UstreamIE(InfoExtractor):
3666     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3667     IE_NAME = u'ustream'
3668
3669     def _real_extract(self, url):
3670         m = re.match(self._VALID_URL, url)
3671         video_id = m.group('videoID')
3672         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3673         webpage = self._download_webpage(url, video_id)
3674         m = re.search(r'data-title="(?P<title>.+)"',webpage)
3675         title = m.group('title')
3676         m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3677         uploader = m.group('uploader')
3678         info = {
3679                 'id':video_id,
3680                 'url':video_url,
3681                 'ext': 'flv',
3682                 'title': title,
3683                 'uploader': uploader
3684                   }
3685         return [info]
3686
3687 class RBMARadioIE(InfoExtractor):
3688     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3689
3690     def _real_extract(self, url):
3691         m = re.match(self._VALID_URL, url)
3692         video_id = m.group('videoID')
3693
3694         webpage = self._download_webpage(url, video_id)
3695         m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3696         if not m:
3697             raise ExtractorError(u'Cannot find metadata')
3698         json_data = m.group(1)
3699
3700         try:
3701             data = json.loads(json_data)
3702         except ValueError as e:
3703             raise ExtractorError(u'Invalid JSON: ' + str(e))
3704
3705         video_url = data['akamai_url'] + '&cbr=256'
3706         url_parts = compat_urllib_parse_urlparse(video_url)
3707         video_ext = url_parts.path.rpartition('.')[2]
3708         info = {
3709                 'id': video_id,
3710                 'url': video_url,
3711                 'ext': video_ext,
3712                 'title': data['title'],
3713                 'description': data.get('teaser_text'),
3714                 'location': data.get('country_of_origin'),
3715                 'uploader': data.get('host', {}).get('name'),
3716                 'uploader_id': data.get('host', {}).get('slug'),
3717                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3718                 'duration': data.get('duration'),
3719         }
3720         return [info]
3721
3722
3723 class YouPornIE(InfoExtractor):
3724     """Information extractor for youporn.com."""
3725     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3726
3727     def _print_formats(self, formats):
3728         """Print all available formats"""
3729         print(u'Available formats:')
3730         print(u'ext\t\tformat')
3731         print(u'---------------------------------')
3732         for format in formats:
3733             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3734
3735     def _specific(self, req_format, formats):
3736         for x in formats:
3737             if(x["format"]==req_format):
3738                 return x
3739         return None
3740
3741     def _real_extract(self, url):
3742         mobj = re.match(self._VALID_URL, url)
3743         if mobj is None:
3744             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3745             return
3746
3747         video_id = mobj.group('videoid')
3748
3749         req = compat_urllib_request.Request(url)
3750         req.add_header('Cookie', 'age_verified=1')
3751         webpage = self._download_webpage(req, video_id)
3752
3753         # Get the video title
3754         result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3755         if result is None:
3756             raise ExtractorError(u'Unable to extract video title')
3757         video_title = result.group('title').strip()
3758
3759         # Get the video date
3760         result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3761         if result is None:
3762             self._downloader.to_stderr(u'WARNING: unable to extract video date')
3763             upload_date = None
3764         else:
3765             upload_date = result.group('date').strip()
3766
3767         # Get the video uploader
3768         result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3769         if result is None:
3770             self._downloader.to_stderr(u'WARNING: unable to extract uploader')
3771             video_uploader = None
3772         else:
3773             video_uploader = result.group('uploader').strip()
3774             video_uploader = clean_html( video_uploader )
3775
3776         # Get all of the formats available
3777         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3778         result = re.search(DOWNLOAD_LIST_RE, webpage)
3779         if result is None:
3780             raise ExtractorError(u'Unable to extract download list')
3781         download_list_html = result.group('download_list').strip()
3782
3783         # Get all of the links from the page
3784         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3785         links = re.findall(LINK_RE, download_list_html)
3786         if(len(links) == 0):
3787             raise ExtractorError(u'ERROR: no known formats available for video')
3788
3789         self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3790
3791         formats = []
3792         for link in links:
3793
3794             # A link looks like this:
3795             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3796             # A path looks like this:
3797             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3798             video_url = unescapeHTML( link )
3799             path = compat_urllib_parse_urlparse( video_url ).path
3800             extension = os.path.splitext( path )[1][1:]
3801             format = path.split('/')[4].split('_')[:2]
3802             size = format[0]
3803             bitrate = format[1]
3804             format = "-".join( format )
3805             title = u'%s-%s-%s' % (video_title, size, bitrate)
3806
3807             formats.append({
3808                 'id': video_id,
3809                 'url': video_url,
3810                 'uploader': video_uploader,
3811                 'upload_date': upload_date,
3812                 'title': title,
3813                 'ext': extension,
3814                 'format': format,
3815                 'thumbnail': None,
3816                 'description': None,
3817                 'player_url': None
3818             })
3819
3820         if self._downloader.params.get('listformats', None):
3821             self._print_formats(formats)
3822             return
3823
3824         req_format = self._downloader.params.get('format', None)
3825         self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3826
3827         if req_format is None or req_format == 'best':
3828             return [formats[0]]
3829         elif req_format == 'worst':
3830             return [formats[-1]]
3831         elif req_format in ('-1', 'all'):
3832             return formats
3833         else:
3834             format = self._specific( req_format, formats )
3835             if result is None:
3836                 self._downloader.trouble(u'ERROR: requested format not available')
3837                 return
3838             return [format]
3839
3840
3841
3842 class PornotubeIE(InfoExtractor):
3843     """Information extractor for pornotube.com."""
3844     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3845
3846     def _real_extract(self, url):
3847         mobj = re.match(self._VALID_URL, url)
3848         if mobj is None:
3849             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3850             return
3851
3852         video_id = mobj.group('videoid')
3853         video_title = mobj.group('title')
3854
3855         # Get webpage content
3856         webpage = self._download_webpage(url, video_id)
3857
3858         # Get the video URL
3859         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3860         result = re.search(VIDEO_URL_RE, webpage)
3861         if result is None:
3862             self._downloader.trouble(u'ERROR: unable to extract video url')
3863             return
3864         video_url = compat_urllib_parse.unquote(result.group('url'))
3865
3866         #Get the uploaded date
3867         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3868         result = re.search(VIDEO_UPLOADED_RE, webpage)
3869         if result is None:
3870             self._downloader.trouble(u'ERROR: unable to extract video title')
3871             return
3872         upload_date = result.group('date')
3873
3874         info = {'id': video_id,
3875                 'url': video_url,
3876                 'uploader': None,
3877                 'upload_date': upload_date,
3878                 'title': video_title,
3879                 'ext': 'flv',
3880                 'format': 'flv'}
3881
3882         return [info]
3883
3884 class YouJizzIE(InfoExtractor):
3885     """Information extractor for youjizz.com."""
3886     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3887
3888     def _real_extract(self, url):
3889         mobj = re.match(self._VALID_URL, url)
3890         if mobj is None:
3891             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3892             return
3893
3894         video_id = mobj.group('videoid')
3895
3896         # Get webpage content
3897         webpage = self._download_webpage(url, video_id)
3898
3899         # Get the video title
3900         result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3901         if result is None:
3902             raise ExtractorError(u'ERROR: unable to extract video title')
3903         video_title = result.group('title').strip()
3904
3905         # Get the embed page
3906         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3907         if result is None:
3908             raise ExtractorError(u'ERROR: unable to extract embed page')
3909
3910         embed_page_url = result.group(0).strip()
3911         video_id = result.group('videoid')
3912
3913         webpage = self._download_webpage(embed_page_url, video_id)
3914
3915         # Get the video URL
3916         result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3917         if result is None:
3918             raise ExtractorError(u'ERROR: unable to extract video url')
3919         video_url = result.group('source')
3920
3921         info = {'id': video_id,
3922                 'url': video_url,
3923                 'title': video_title,
3924                 'ext': 'flv',
3925                 'format': 'flv',
3926                 'player_url': embed_page_url}
3927
3928         return [info]
3929
3930 class EightTracksIE(InfoExtractor):
3931     IE_NAME = '8tracks'
3932     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3933
3934     def _real_extract(self, url):
3935         mobj = re.match(self._VALID_URL, url)
3936         if mobj is None:
3937             raise ExtractorError(u'Invalid URL: %s' % url)
3938         playlist_id = mobj.group('id')
3939
3940         webpage = self._download_webpage(url, playlist_id)
3941
3942         m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3943         if not m:
3944             raise ExtractorError(u'Cannot find trax information')
3945         json_like = m.group(1)
3946         data = json.loads(json_like)
3947
3948         session = str(random.randint(0, 1000000000))
3949         mix_id = data['id']
3950         track_count = data['tracks_count']
3951         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3952         next_url = first_url
3953         res = []
3954         for i in itertools.count():
3955             api_json = self._download_webpage(next_url, playlist_id,
3956                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3957                 errnote=u'Failed to download song information')
3958             api_data = json.loads(api_json)
3959             track_data = api_data[u'set']['track']
3960             info = {
3961                 'id': track_data['id'],
3962                 'url': track_data['track_file_stream_url'],
3963                 'title': track_data['performer'] + u' - ' + track_data['name'],
3964                 'raw_title': track_data['name'],
3965                 'uploader_id': data['user']['login'],
3966                 'ext': 'm4a',
3967             }
3968             res.append(info)
3969             if api_data['set']['at_last_track']:
3970                 break
3971             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3972         return res
3973
3974 class KeekIE(InfoExtractor):
3975     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3976     IE_NAME = u'keek'
3977
3978     def _real_extract(self, url):
3979         m = re.match(self._VALID_URL, url)
3980         video_id = m.group('videoID')
3981         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3982         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3983         webpage = self._download_webpage(url, video_id)
3984         m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
3985         title = unescapeHTML(m.group('title'))
3986         m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
3987         uploader = unescapeHTML(m.group('uploader'))
3988         info = {
3989                 'id':video_id,
3990                 'url':video_url,
3991                 'ext': 'mp4',
3992                 'title': title,
3993                 'thumbnail': thumbnail,
3994                 'uploader': uploader
3995         }
3996         return [info]
3997
3998 class TEDIE(InfoExtractor):
3999     _VALID_URL=r'''http://www.ted.com/
4000                    (
4001                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4002                         |
4003                         ((?P<type_talk>talks)) # We have a simple talk
4004                    )
4005                    /(?P<name>\w+) # Here goes the name and then ".html"
4006                    '''
4007
4008     def suitable(self, url):
4009         """Receives a URL and returns True if suitable for this IE."""
4010         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
4011
4012     def _real_extract(self, url):
4013         m=re.match(self._VALID_URL, url, re.VERBOSE)
4014         if m.group('type_talk'):
4015             return [self._talk_info(url)]
4016         else :
4017             playlist_id=m.group('playlist_id')
4018             name=m.group('name')
4019             self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4020             return self._playlist_videos_info(url,name,playlist_id)
4021
4022     def _talk_video_link(self,mediaSlug):
4023         '''Returns the video link for that mediaSlug'''
4024         return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4025
4026     def _playlist_videos_info(self,url,name,playlist_id=0):
4027         '''Returns the videos of the playlist'''
4028         video_RE=r'''
4029                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4030                      ([.\s]*?)data-playlist_item_id="(\d+)"
4031                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4032                      '''
4033         video_name_RE=r'<p\ class="talk-title"><a href="/talks/(.+).html">(?P<fullname>.+?)</a></p>'
4034         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4035         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4036         m_names=re.finditer(video_name_RE,webpage)
4037         info=[]
4038         for m_video, m_name in zip(m_videos,m_names):
4039             video_dic={
4040                        'id': m_video.group('video_id'),
4041                        'url': self._talk_video_link(m_video.group('mediaSlug')),
4042                        'ext': 'mp4',
4043                        'title': m_name.group('fullname')
4044                        }
4045             info.append(video_dic)
4046         return info
4047     def _talk_info(self, url, video_id=0):
4048         """Return the video for the talk in the url"""
4049         m=re.match(self._VALID_URL, url,re.VERBOSE)
4050         videoName=m.group('name')
4051         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4052         # If the url includes the language we get the title translated
4053         title_RE=r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>'
4054         title=re.search(title_RE, webpage).group('title')
4055         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4056                         "id":(?P<videoID>[\d]+).*?
4057                         "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4058         info_match=re.search(info_RE,webpage,re.VERBOSE)
4059         video_id=info_match.group('videoID')
4060         mediaSlug=info_match.group('mediaSlug')
4061         video_url=self._talk_video_link(mediaSlug)
4062         info = {
4063                 'id': video_id,
4064                 'url': video_url,
4065                 'ext': 'mp4',
4066                 'title': title
4067                 }
4068         return info
4069
4070 class MySpassIE(InfoExtractor):
4071     _VALID_URL = r'http://www.myspass.de/.*'
4072
4073     def _real_extract(self, url):
4074         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4075
4076         # video id is the last path element of the URL
4077         # usually there is a trailing slash, so also try the second but last
4078         url_path = compat_urllib_parse_urlparse(url).path
4079         url_parent_path, video_id = os.path.split(url_path)
4080         if not video_id:
4081             _, video_id = os.path.split(url_parent_path)
4082
4083         # get metadata
4084         metadata_url = META_DATA_URL_TEMPLATE % video_id
4085         metadata_text = self._download_webpage(metadata_url, video_id)
4086         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4087
4088         # extract values from metadata
4089         url_flv_el = metadata.find('url_flv')
4090         if url_flv_el is None:
4091             self._downloader.trouble(u'ERROR: unable to extract download url')
4092             return
4093         video_url = url_flv_el.text
4094         extension = os.path.splitext(video_url)[1][1:]
4095         title_el = metadata.find('title')
4096         if title_el is None:
4097             self._downloader.trouble(u'ERROR: unable to extract title')
4098             return
4099         title = title_el.text
4100         format_id_el = metadata.find('format_id')
4101         if format_id_el is None:
4102             format = ext
4103         else:
4104             format = format_id_el.text
4105         description_el = metadata.find('description')
4106         if description_el is not None:
4107             description = description_el.text
4108         else:
4109             description = None
4110         imagePreview_el = metadata.find('imagePreview')
4111         if imagePreview_el is not None:
4112             thumbnail = imagePreview_el.text
4113         else:
4114             thumbnail = None
4115         info = {
4116             'id': video_id,
4117             'url': video_url,
4118             'title': title,
4119             'ext': extension,
4120             'format': format,
4121             'thumbnail': thumbnail,
4122             'description': description
4123         }
4124         return [info]
4125
4126 def gen_extractors():
4127     """ Return a list of an instance of every supported extractor.
4128     The order does matter; the first extractor matched is the one handling the URL.
4129     """
4130     return [
4131         YoutubePlaylistIE(),
4132         YoutubeChannelIE(),
4133         YoutubeUserIE(),
4134         YoutubeSearchIE(),
4135         YoutubeIE(),
4136         MetacafeIE(),
4137         DailymotionIE(),
4138         GoogleSearchIE(),
4139         PhotobucketIE(),
4140         YahooIE(),
4141         YahooSearchIE(),
4142         DepositFilesIE(),
4143         FacebookIE(),
4144         BlipTVUserIE(),
4145         BlipTVIE(),
4146         VimeoIE(),
4147         MyVideoIE(),
4148         ComedyCentralIE(),
4149         EscapistIE(),
4150         CollegeHumorIE(),
4151         XVideosIE(),
4152         SoundcloudIE(),
4153         InfoQIE(),
4154         MixcloudIE(),
4155         StanfordOpenClassroomIE(),
4156         MTVIE(),
4157         YoukuIE(),
4158         XNXXIE(),
4159         YouJizzIE(),
4160         PornotubeIE(),
4161         YouPornIE(),
4162         GooglePlusIE(),
4163         ArteTvIE(),
4164         NBAIE(),
4165         JustinTVIE(),
4166         FunnyOrDieIE(),
4167         TweetReelIE(),
4168         SteamIE(),
4169         UstreamIE(),
4170         RBMARadioIE(),
4171         EightTracksIE(),
4172         KeekIE(),
4173         TEDIE(),
4174         MySpassIE(),
4175         GenericIE()
4176     ]
4177
4178