youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19 import hashlib
  20 import binascii
  21 import urllib
  22
  23 from .utils import *
  24
  25
  26 class InfoExtractor(object):
  27     """Information Extractor class.
  28
  29     Information extractors are the classes that, given a URL, extract
  30     information about the video (or videos) the URL refers to. This
  31     information includes the real video URL, the video title, author and
  32     others. The information is stored in a dictionary which is then
  33     passed to the FileDownloader. The FileDownloader processes this
  34     information possibly downloading the video to the file system, among
  35     other possible outcomes.
  36
  37     The dictionaries must include the following fields:
  38
  39     id:             Video identifier.
  40     url:            Final video URL.
  41     title:          Video title, unescaped.
  42     ext:            Video filename extension.
  43
  44     The following fields are optional:
  45
  46     format:         The video format, defaults to ext (used for --get-format)
  47     thumbnail:      Full URL to a video thumbnail image.
  48     description:    One-line video description.
  49     uploader:       Full name of the video uploader.
  50     upload_date:    Video upload date (YYYYMMDD).
  51     uploader_id:    Nickname or id of the video uploader.
  52     location:       Physical location of the video.
  53     player_url:     SWF Player URL (used for rtmpdump).
  54     subtitles:      The subtitle file contents.
  55     urlhandle:      [internal] The urlHandle to be used to download the file,
  56                     like returned by urllib.request.urlopen
  57
  58     The fields should all be Unicode strings.
  59
  60     Subclasses of this one should re-define the _real_initialize() and
  61     _real_extract() methods and define a _VALID_URL regexp.
  62     Probably, they should also be added to the list of extractors.
  63
  64     _real_extract() must return a *list* of information dictionaries as
  65     described above.
  66
  67     Finally, the _WORKING attribute should be set to False for broken IEs
  68     in order to warn the users and skip the tests.
  69     """
  70
  71     _ready = False
  72     _downloader = None
  73     _WORKING = True
  74
  75     def __init__(self, downloader=None):
  76         """Constructor. Receives an optional downloader."""
  77         self._ready = False
  78         self.set_downloader(downloader)
  79
  80     @classmethod
  81     def suitable(cls, url):
  82         """Receives a URL and returns True if suitable for this IE."""
  83         return re.match(cls._VALID_URL, url) is not None
  84
  85     @classmethod
  86     def working(cls):
  87         """Getter method for _WORKING."""
  88         return cls._WORKING
  89
  90     def initialize(self):
  91         """Initializes an instance (authentication, etc)."""
  92         if not self._ready:
  93             self._real_initialize()
  94             self._ready = True
  95
  96     def extract(self, url):
  97         """Extracts URL information and returns it in list of dicts."""
  98         self.initialize()
  99         return self._real_extract(url)
 100
 101     def set_downloader(self, downloader):
 102         """Sets the downloader for this IE."""
 103         self._downloader = downloader
 104
 105     def _real_initialize(self):
 106         """Real initialization process. Redefine in subclasses."""
 107         pass
 108
 109     def _real_extract(self, url):
 110         """Real extraction process. Redefine in subclasses."""
 111         pass
 112
 113     @property
 114     def IE_NAME(self):
 115         return type(self).__name__[:-2]
 116
 117     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 118         """ Returns the response handle """
 119         if note is None:
 120             self.report_download_webpage(video_id)
 121         elif note is not False:
 122             self.to_screen(u'%s: %s' % (video_id, note))
 123         try:
 124             return compat_urllib_request.urlopen(url_or_request)
 125         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 126             if errnote is None:
 127                 errnote = u'Unable to download webpage'
 128             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 129
 130     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 131         """ Returns a tuple (page content as string, URL handle) """
 132         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 133         content_type = urlh.headers.get('Content-Type', '')
 134         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 135         if m:
 136             encoding = m.group(1)
 137         else:
 138             encoding = 'utf-8'
 139         webpage_bytes = urlh.read()
 140         if self._downloader.params.get('dump_intermediate_pages', False):
 141             try:
 142                 url = url_or_request.get_full_url()
 143             except AttributeError:
 144                 url = url_or_request
 145             self.to_screen(u'Dumping request to ' + url)
 146             dump = base64.b64encode(webpage_bytes).decode('ascii')
 147             self._downloader.to_screen(dump)
 148         content = webpage_bytes.decode(encoding, 'replace')
 149         return (content, urlh)
 150
 151     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 152         """ Returns the data of the page as a string """
 153         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 154
 155     def to_screen(self, msg):
 156         """Print msg to screen, prefixing it with '[ie_name]'"""
 157         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 158
 159     def report_extraction(self, id_or_name):
 160         """Report information extraction."""
 161         self.to_screen(u'%s: Extracting information' % id_or_name)
 162
 163     def report_download_webpage(self, video_id):
 164         """Report webpage download."""
 165         self.to_screen(u'%s: Downloading webpage' % video_id)
 166
 167     def report_age_confirmation(self):
 168         """Report attempt to confirm age."""
 169         self.to_screen(u'Confirming age')
 170
 171     #Methods for following #608
 172     #They set the correct value of the '_type' key
 173     def video_result(self, video_info):
 174         """Returns a video"""
 175         video_info['_type'] = 'video'
 176         return video_info
 177     def url_result(self, url, ie=None):
 178         """Returns a url that points to a page that should be processed"""
 179         #TODO: ie should be the class used for getting the info
 180         video_info = {'_type': 'url',
 181                       'url': url,
 182                       'ie_key': ie}
 183         return video_info
 184     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 185         """Returns a playlist"""
 186         video_info = {'_type': 'playlist',
 187                       'entries': entries}
 188         if playlist_id:
 189             video_info['id'] = playlist_id
 190         if playlist_title:
 191             video_info['title'] = playlist_title
 192         return video_info
 193
 194     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 195         """
 196         Perform a regex search on the given string, using a single or a list of
 197         patterns returning the first matching group.
 198         In case of failure return a default value or raise a WARNING or a
 199         ExtractorError, depending on fatal, specifying the field name.
 200         """
 201         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 202             mobj = re.search(pattern, string, flags)
 203         else:
 204             for p in pattern:
 205                 mobj = re.search(p, string, flags)
 206                 if mobj: break
 207
 208         if sys.stderr.isatty() and os.name != 'nt':
 209             _name = u'\033[0;34m%s\033[0m' % name
 210         else:
 211             _name = name
 212
 213         if mobj:
 214             # return the first matching group
 215             return next(g for g in mobj.groups() if g is not None)
 216         elif default is not None:
 217             return default
 218         elif fatal:
 219             raise ExtractorError(u'Unable to extract %s' % _name)
 220         else:
 221             self._downloader.report_warning(u'unable to extract %s; '
 222                 u'please report this issue on GitHub.' % _name)
 223             return None
 224
 225     def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 226         """
 227         Like _search_regex, but strips HTML tags and unescapes entities.
 228         """
 229         res = self._search_regex(pattern, string, name, default, fatal, flags)
 230         if res:
 231             return clean_html(res).strip()
 232         else:
 233             return res
 234
 235 class SearchInfoExtractor(InfoExtractor):
 236     """
 237     Base class for paged search queries extractors.
 238     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 239     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 240     """
 241
 242     @classmethod
 243     def _make_valid_url(cls):
 244         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 245
 246     @classmethod
 247     def suitable(cls, url):
 248         return re.match(cls._make_valid_url(), url) is not None
 249
 250     def _real_extract(self, query):
 251         mobj = re.match(self._make_valid_url(), query)
 252         if mobj is None:
 253             raise ExtractorError(u'Invalid search query "%s"' % query)
 254
 255         prefix = mobj.group('prefix')
 256         query = mobj.group('query')
 257         if prefix == '':
 258             return self._get_n_results(query, 1)
 259         elif prefix == 'all':
 260             return self._get_n_results(query, self._MAX_RESULTS)
 261         else:
 262             n = int(prefix)
 263             if n <= 0:
 264                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 265             elif n > self._MAX_RESULTS:
 266                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 267                 n = self._MAX_RESULTS
 268             return self._get_n_results(query, n)
 269
 270     def _get_n_results(self, query, n):
 271         """Get a specified number of results for a query"""
 272         raise NotImplementedError("This method must be implemented by sublclasses")
 273
 274
 275 class YoutubeIE(InfoExtractor):
 276     """Information extractor for youtube.com."""
 277
 278     _VALID_URL = r"""^
 279                      (
 280                          (?:https?://)?                                       # http(s):// (optional)
 281                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 282                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 283                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 284                          (?:                                                  # the various things that can precede the ID:
 285                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 286                              |(?:                                             # or the v= param in all its forms
 287                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 288                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 289                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 290                                  v=
 291                              )
 292                          )?                                                   # optional -> youtube.com/xxxx is OK
 293                      )?                                                       # all until now is optional -> you can pass the naked ID
 294                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 295                      (?(1).+)?                                                # if we found the ID, everything can follow
 296                      $"""
 297     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 298     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 299     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 300     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 301     _NETRC_MACHINE = 'youtube'
 302     # Listed in order of quality
 303     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 304     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 305     _video_extensions = {
 306         '13': '3gp',
 307         '17': 'mp4',
 308         '18': 'mp4',
 309         '22': 'mp4',
 310         '37': 'mp4',
 311         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 312         '43': 'webm',
 313         '44': 'webm',
 314         '45': 'webm',
 315         '46': 'webm',
 316     }
 317     _video_dimensions = {
 318         '5': '240x400',
 319         '6': '???',
 320         '13': '???',
 321         '17': '144x176',
 322         '18': '360x640',
 323         '22': '720x1280',
 324         '34': '360x640',
 325         '35': '480x854',
 326         '37': '1080x1920',
 327         '38': '3072x4096',
 328         '43': '360x640',
 329         '44': '480x854',
 330         '45': '720x1280',
 331         '46': '1080x1920',
 332     }
 333     IE_NAME = u'youtube'
 334
 335     @classmethod
 336     def suitable(cls, url):
 337         """Receives a URL and returns True if suitable for this IE."""
 338         if YoutubePlaylistIE.suitable(url): return False
 339         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 340
 341     def report_lang(self):
 342         """Report attempt to set language."""
 343         self.to_screen(u'Setting language')
 344
 345     def report_login(self):
 346         """Report attempt to log in."""
 347         self.to_screen(u'Logging in')
 348
 349     def report_video_webpage_download(self, video_id):
 350         """Report attempt to download video webpage."""
 351         self.to_screen(u'%s: Downloading video webpage' % video_id)
 352
 353     def report_video_info_webpage_download(self, video_id):
 354         """Report attempt to download video info webpage."""
 355         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 356
 357     def report_video_subtitles_download(self, video_id):
 358         """Report attempt to download video info webpage."""
 359         self.to_screen(u'%s: Checking available subtitles' % video_id)
 360
 361     def report_video_subtitles_request(self, video_id, sub_lang, format):
 362         """Report attempt to download video info webpage."""
 363         self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 364
 365     def report_video_subtitles_available(self, video_id, sub_lang_list):
 366         """Report available subtitles."""
 367         sub_lang = ",".join(list(sub_lang_list.keys()))
 368         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
 369
 370     def report_information_extraction(self, video_id):
 371         """Report attempt to extract video information."""
 372         self.to_screen(u'%s: Extracting video information' % video_id)
 373
 374     def report_unavailable_format(self, video_id, format):
 375         """Report extracted video URL."""
 376         self.to_screen(u'%s: Format %s not available' % (video_id, format))
 377
 378     def report_rtmp_download(self):
 379         """Indicate the download will use the RTMP protocol."""
 380         self.to_screen(u'RTMP download detected')
 381
 382     def _get_available_subtitles(self, video_id):
 383         self.report_video_subtitles_download(video_id)
 384         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 385         try:
 386             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 387         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 388             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 389         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 390         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 391         if not sub_lang_list:
 392             return (u'video doesn\'t have subtitles', None)
 393         return sub_lang_list
 394
 395     def _list_available_subtitles(self, video_id):
 396         sub_lang_list = self._get_available_subtitles(video_id)
 397         self.report_video_subtitles_available(video_id, sub_lang_list)
 398
 399     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 400         """
 401         Return tuple:
 402         (error_message, sub_lang, sub)
 403         """
 404         self.report_video_subtitles_request(video_id, sub_lang, format)
 405         params = compat_urllib_parse.urlencode({
 406             'lang': sub_lang,
 407             'name': sub_name,
 408             'v': video_id,
 409             'fmt': format,
 410         })
 411         url = 'http://www.youtube.com/api/timedtext?' + params
 412         try:
 413             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 414         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 415             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 416         if not sub:
 417             return (u'Did not fetch video subtitles', None, None)
 418         return (None, sub_lang, sub)
 419
 420     def _request_automatic_caption(self, video_id, webpage):
 421         """We need the webpage for getting the captions url, pass it as an
 422            argument to speed up the process."""
 423         sub_lang = self._downloader.params.get('subtitleslang')
 424         sub_format = self._downloader.params.get('subtitlesformat')
 425         self.to_screen(u'%s: Looking for automatic captions' % video_id)
 426         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 427         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
 428         if mobj is None:
 429             return [(err_msg, None, None)]
 430         player_config = json.loads(mobj.group(1))
 431         try:
 432             args = player_config[u'args']
 433             caption_url = args[u'ttsurl']
 434             timestamp = args[u'timestamp']
 435             params = compat_urllib_parse.urlencode({
 436                 'lang': 'en',
 437                 'tlang': sub_lang,
 438                 'fmt': sub_format,
 439                 'ts': timestamp,
 440                 'kind': 'asr',
 441             })
 442             subtitles_url = caption_url + '&' + params
 443             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
 444             return [(None, sub_lang, sub)]
 445         except KeyError:
 446             return [(err_msg, None, None)]
 447
 448     def _extract_subtitle(self, video_id):
 449         """
 450         Return a list with a tuple:
 451         [(error_message, sub_lang, sub)]
 452         """
 453         sub_lang_list = self._get_available_subtitles(video_id)
 454         sub_format = self._downloader.params.get('subtitlesformat')
 455         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 456             return [(sub_lang_list[0], None, None)]
 457         if self._downloader.params.get('subtitleslang', False):
 458             sub_lang = self._downloader.params.get('subtitleslang')
 459         elif 'en' in sub_lang_list:
 460             sub_lang = 'en'
 461         else:
 462             sub_lang = list(sub_lang_list.keys())[0]
 463         if not sub_lang in sub_lang_list:
 464             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 465
 466         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 467         return [subtitle]
 468
 469     def _extract_all_subtitles(self, video_id):
 470         sub_lang_list = self._get_available_subtitles(video_id)
 471         sub_format = self._downloader.params.get('subtitlesformat')
 472         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 473             return [(sub_lang_list[0], None, None)]
 474         subtitles = []
 475         for sub_lang in sub_lang_list:
 476             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 477             subtitles.append(subtitle)
 478         return subtitles
 479
 480     def _print_formats(self, formats):
 481         print('Available formats:')
 482         for x in formats:
 483             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 484
 485     def _real_initialize(self):
 486         if self._downloader is None:
 487             return
 488
 489         username = None
 490         password = None
 491         downloader_params = self._downloader.params
 492
 493         # Attempt to use provided username and password or .netrc data
 494         if downloader_params.get('username', None) is not None:
 495             username = downloader_params['username']
 496             password = downloader_params['password']
 497         elif downloader_params.get('usenetrc', False):
 498             try:
 499                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 500                 if info is not None:
 501                     username = info[0]
 502                     password = info[2]
 503                 else:
 504                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 505             except (IOError, netrc.NetrcParseError) as err:
 506                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 507                 return
 508
 509         # Set language
 510         request = compat_urllib_request.Request(self._LANG_URL)
 511         try:
 512             self.report_lang()
 513             compat_urllib_request.urlopen(request).read()
 514         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 515             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 516             return
 517
 518         # No authentication to be performed
 519         if username is None:
 520             return
 521
 522         request = compat_urllib_request.Request(self._LOGIN_URL)
 523         try:
 524             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 525         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 526             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 527             return
 528
 529         galx = None
 530         dsh = None
 531         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 532         if match:
 533           galx = match.group(1)
 534
 535         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 536         if match:
 537           dsh = match.group(1)
 538
 539         # Log in
 540         login_form_strs = {
 541                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 542                 u'Email': username,
 543                 u'GALX': galx,
 544                 u'Passwd': password,
 545                 u'PersistentCookie': u'yes',
 546                 u'_utf8': u'霱',
 547                 u'bgresponse': u'js_disabled',
 548                 u'checkConnection': u'',
 549                 u'checkedDomains': u'youtube',
 550                 u'dnConn': u'',
 551                 u'dsh': dsh,
 552                 u'pstMsg': u'0',
 553                 u'rmShown': u'1',
 554                 u'secTok': u'',
 555                 u'signIn': u'Sign in',
 556                 u'timeStmp': u'',
 557                 u'service': u'youtube',
 558                 u'uilel': u'3',
 559                 u'hl': u'en_US',
 560         }
 561         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 562         # chokes on unicode
 563         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 564         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 565         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 566         try:
 567             self.report_login()
 568             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 569             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 570                 self._downloader.report_warning(u'unable to log in: bad username or password')
 571                 return
 572         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 573             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 574             return
 575
 576         # Confirm age
 577         age_form = {
 578                 'next_url':     '/',
 579                 'action_confirm':   'Confirm',
 580                 }
 581         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 582         try:
 583             self.report_age_confirmation()
 584             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 585         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 586             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 587
 588     def _extract_id(self, url):
 589         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 590         if mobj is None:
 591             raise ExtractorError(u'Invalid URL: %s' % url)
 592         video_id = mobj.group(2)
 593         return video_id
 594
 595     def _real_extract(self, url):
 596         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 597         mobj = re.search(self._NEXT_URL_RE, url)
 598         if mobj:
 599             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 600         video_id = self._extract_id(url)
 601
 602         # Get video webpage
 603         self.report_video_webpage_download(video_id)
 604         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 605         request = compat_urllib_request.Request(url)
 606         try:
 607             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 608         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 609             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
 610
 611         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 612
 613         # Attempt to extract SWF player URL
 614         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 615         if mobj is not None:
 616             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 617         else:
 618             player_url = None
 619
 620         # Get video info
 621         self.report_video_info_webpage_download(video_id)
 622         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 623             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 624                     % (video_id, el_type))
 625             video_info_webpage = self._download_webpage(video_info_url, video_id,
 626                                     note=False,
 627                                     errnote='unable to download video info webpage')
 628             video_info = compat_parse_qs(video_info_webpage)
 629             if 'token' in video_info:
 630                 break
 631         if 'token' not in video_info:
 632             if 'reason' in video_info:
 633                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
 634             else:
 635                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
 636
 637         # Check for "rental" videos
 638         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 639             raise ExtractorError(u'"rental" videos not supported')
 640
 641         # Start extracting information
 642         self.report_information_extraction(video_id)
 643
 644         # uploader
 645         if 'author' not in video_info:
 646             raise ExtractorError(u'Unable to extract uploader name')
 647         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 648
 649         # uploader_id
 650         video_uploader_id = None
 651         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 652         if mobj is not None:
 653             video_uploader_id = mobj.group(1)
 654         else:
 655             self._downloader.report_warning(u'unable to extract uploader nickname')
 656
 657         # title
 658         if 'title' not in video_info:
 659             raise ExtractorError(u'Unable to extract video title')
 660         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 661
 662         # thumbnail image
 663         if 'thumbnail_url' not in video_info:
 664             self._downloader.report_warning(u'unable to extract video thumbnail')
 665             video_thumbnail = ''
 666         else:   # don't panic if we can't find it
 667             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 668
 669         # upload date
 670         upload_date = None
 671         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 672         if mobj is not None:
 673             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 674             upload_date = unified_strdate(upload_date)
 675
 676         # description
 677         video_description = get_element_by_id("eow-description", video_webpage)
 678         if video_description:
 679             video_description = clean_html(video_description)
 680         else:
 681             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 682             if fd_mobj:
 683                 video_description = unescapeHTML(fd_mobj.group(1))
 684             else:
 685                 video_description = u''
 686
 687         # subtitles
 688         video_subtitles = None
 689
 690         if self._downloader.params.get('writesubtitles', False):
 691             video_subtitles = self._extract_subtitle(video_id)
 692             if video_subtitles:
 693                 (sub_error, sub_lang, sub) = video_subtitles[0]
 694                 if sub_error:
 695                     # We try with the automatic captions
 696                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
 697                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
 698                     if sub is not None:
 699                         pass
 700                     else:
 701                         # We report the original error
 702                         self._downloader.report_error(sub_error)
 703
 704         if self._downloader.params.get('allsubtitles', False):
 705             video_subtitles = self._extract_all_subtitles(video_id)
 706             for video_subtitle in video_subtitles:
 707                 (sub_error, sub_lang, sub) = video_subtitle
 708                 if sub_error:
 709                     self._downloader.report_error(sub_error)
 710
 711         if self._downloader.params.get('listsubtitles', False):
 712             sub_lang_list = self._list_available_subtitles(video_id)
 713             return
 714
 715         if 'length_seconds' not in video_info:
 716             self._downloader.report_warning(u'unable to extract video duration')
 717             video_duration = ''
 718         else:
 719             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 720
 721         # token
 722         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 723
 724         # Decide which formats to download
 725         req_format = self._downloader.params.get('format', None)
 726
 727         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 728             self.report_rtmp_download()
 729             video_url_list = [(None, video_info['conn'][0])]
 730         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 731             url_map = {}
 732             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
 733                 url_data = compat_parse_qs(url_data_str)
 734                 if 'itag' in url_data and 'url' in url_data:
 735                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
 736                     if not 'ratebypass' in url: url += '&ratebypass=yes'
 737                     url_map[url_data['itag'][0]] = url
 738
 739             format_limit = self._downloader.params.get('format_limit', None)
 740             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 741             if format_limit is not None and format_limit in available_formats:
 742                 format_list = available_formats[available_formats.index(format_limit):]
 743             else:
 744                 format_list = available_formats
 745             existing_formats = [x for x in format_list if x in url_map]
 746             if len(existing_formats) == 0:
 747                 raise ExtractorError(u'no known formats available for video')
 748             if self._downloader.params.get('listformats', None):
 749                 self._print_formats(existing_formats)
 750                 return
 751             if req_format is None or req_format == 'best':
 752                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 753             elif req_format == 'worst':
 754                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 755             elif req_format in ('-1', 'all'):
 756                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 757             else:
 758                 # Specific formats. We pick the first in a slash-delimeted sequence.
 759                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 760                 req_formats = req_format.split('/')
 761                 video_url_list = None
 762                 for rf in req_formats:
 763                     if rf in url_map:
 764                         video_url_list = [(rf, url_map[rf])]
 765                         break
 766                 if video_url_list is None:
 767                     raise ExtractorError(u'requested format not available')
 768         else:
 769             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
 770
 771         results = []
 772         for format_param, video_real_url in video_url_list:
 773             # Extension
 774             video_extension = self._video_extensions.get(format_param, 'flv')
 775
 776             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 777                                               self._video_dimensions.get(format_param, '???'))
 778
 779             results.append({
 780                 'id':       video_id,
 781                 'url':      video_real_url,
 782                 'uploader': video_uploader,
 783                 'uploader_id': video_uploader_id,
 784                 'upload_date':  upload_date,
 785                 'title':    video_title,
 786                 'ext':      video_extension,
 787                 'format':   video_format,
 788                 'thumbnail':    video_thumbnail,
 789                 'description':  video_description,
 790                 'player_url':   player_url,
 791                 'subtitles':    video_subtitles,
 792                 'duration':     video_duration
 793             })
 794         return results
 795
 796
 797 class MetacafeIE(InfoExtractor):
 798     """Information Extractor for metacafe.com."""
 799
 800     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 801     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 802     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 803     IE_NAME = u'metacafe'
 804
 805     def report_disclaimer(self):
 806         """Report disclaimer retrieval."""
 807         self.to_screen(u'Retrieving disclaimer')
 808
 809     def _real_initialize(self):
 810         # Retrieve disclaimer
 811         request = compat_urllib_request.Request(self._DISCLAIMER)
 812         try:
 813             self.report_disclaimer()
 814             disclaimer = compat_urllib_request.urlopen(request).read()
 815         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 816             raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
 817
 818         # Confirm age
 819         disclaimer_form = {
 820             'filters': '0',
 821             'submit': "Continue - I'm over 18",
 822             }
 823         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 824         try:
 825             self.report_age_confirmation()
 826             disclaimer = compat_urllib_request.urlopen(request).read()
 827         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 828             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 829
 830     def _real_extract(self, url):
 831         # Extract id and simplified title from URL
 832         mobj = re.match(self._VALID_URL, url)
 833         if mobj is None:
 834             raise ExtractorError(u'Invalid URL: %s' % url)
 835
 836         video_id = mobj.group(1)
 837
 838         # Check if video comes from YouTube
 839         mobj2 = re.match(r'^yt-(.*)$', video_id)
 840         if mobj2 is not None:
 841             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
 842
 843         # Retrieve video webpage to extract further information
 844         webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
 845
 846         # Extract URL, uploader and title from webpage
 847         self.report_extraction(video_id)
 848         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 849         if mobj is not None:
 850             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 851             video_extension = mediaURL[-3:]
 852
 853             # Extract gdaKey if available
 854             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 855             if mobj is None:
 856                 video_url = mediaURL
 857             else:
 858                 gdaKey = mobj.group(1)
 859                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 860         else:
 861             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 862             if mobj is None:
 863                 raise ExtractorError(u'Unable to extract media URL')
 864             vardict = compat_parse_qs(mobj.group(1))
 865             if 'mediaData' not in vardict:
 866                 raise ExtractorError(u'Unable to extract media URL')
 867             mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
 868             if mobj is None:
 869                 raise ExtractorError(u'Unable to extract media URL')
 870             mediaURL = mobj.group('mediaURL').replace('\\/', '/')
 871             video_extension = mediaURL[-3:]
 872             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
 873
 874         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 875         if mobj is None:
 876             raise ExtractorError(u'Unable to extract title')
 877         video_title = mobj.group(1).decode('utf-8')
 878
 879         mobj = re.search(r'submitter=(.*?);', webpage)
 880         if mobj is None:
 881             raise ExtractorError(u'Unable to extract uploader nickname')
 882         video_uploader = mobj.group(1)
 883
 884         return [{
 885             'id':       video_id.decode('utf-8'),
 886             'url':      video_url.decode('utf-8'),
 887             'uploader': video_uploader.decode('utf-8'),
 888             'upload_date':  None,
 889             'title':    video_title,
 890             'ext':      video_extension.decode('utf-8'),
 891         }]
 892
 893 class DailymotionIE(InfoExtractor):
 894     """Information Extractor for Dailymotion"""
 895
 896     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 897     IE_NAME = u'dailymotion'
 898
 899     def _real_extract(self, url):
 900         # Extract id and simplified title from URL
 901         mobj = re.match(self._VALID_URL, url)
 902         if mobj is None:
 903             raise ExtractorError(u'Invalid URL: %s' % url)
 904
 905         video_id = mobj.group(1).split('_')[0].split('?')[0]
 906
 907         video_extension = 'mp4'
 908
 909         # Retrieve video webpage to extract further information
 910         request = compat_urllib_request.Request(url)
 911         request.add_header('Cookie', 'family_filter=off')
 912         webpage = self._download_webpage(request, video_id)
 913
 914         # Extract URL, uploader and title from webpage
 915         self.report_extraction(video_id)
 916         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 917         if mobj is None:
 918             raise ExtractorError(u'Unable to extract media URL')
 919         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 920
 921         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 922             if key in flashvars:
 923                 max_quality = key
 924                 self.to_screen(u'Using %s' % key)
 925                 break
 926         else:
 927             raise ExtractorError(u'Unable to extract video URL')
 928
 929         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 930         if mobj is None:
 931             raise ExtractorError(u'Unable to extract video URL')
 932
 933         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 934
 935         # TODO: support choosing qualities
 936
 937         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 938         if mobj is None:
 939             raise ExtractorError(u'Unable to extract title')
 940         video_title = unescapeHTML(mobj.group('title'))
 941
 942         video_uploader = None
 943         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 944         if mobj is None:
 945             # lookin for official user
 946             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 947             if mobj_official is None:
 948                 self._downloader.report_warning(u'unable to extract uploader nickname')
 949             else:
 950                 video_uploader = mobj_official.group(1)
 951         else:
 952             video_uploader = mobj.group(1)
 953
 954         video_upload_date = None
 955         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 956         if mobj is not None:
 957             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 958
 959         return [{
 960             'id':       video_id,
 961             'url':      video_url,
 962             'uploader': video_uploader,
 963             'upload_date':  video_upload_date,
 964             'title':    video_title,
 965             'ext':      video_extension,
 966         }]
 967
 968
 969 class PhotobucketIE(InfoExtractor):
 970     """Information extractor for photobucket.com."""
 971
 972     # TODO: the original _VALID_URL was:
 973     # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 974     # Check if it's necessary to keep the old extracion process
 975     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
 976     IE_NAME = u'photobucket'
 977
 978     def _real_extract(self, url):
 979         # Extract id from URL
 980         mobj = re.match(self._VALID_URL, url)
 981         if mobj is None:
 982             raise ExtractorError(u'Invalid URL: %s' % url)
 983
 984         video_id = mobj.group('id')
 985
 986         video_extension = mobj.group('ext')
 987
 988         # Retrieve video webpage to extract further information
 989         webpage = self._download_webpage(url, video_id)
 990
 991         # Extract URL, uploader, and title from webpage
 992         self.report_extraction(video_id)
 993         # We try first by looking the javascript code:
 994         mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
 995         if mobj is not None:
 996             info = json.loads(mobj.group('json'))
 997             return [{
 998                 'id':       video_id,
 999                 'url':      info[u'downloadUrl'],
1000                 'uploader': info[u'username'],
1001                 'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
1002                 'title':    info[u'title'],
1003                 'ext':      video_extension,
1004                 'thumbnail': info[u'thumbUrl'],
1005             }]
1006
1007         # We try looking in other parts of the webpage
1008         video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1009             webpage, u'video URL')
1010
1011         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1012         if mobj is None:
1013             raise ExtractorError(u'Unable to extract title')
1014         video_title = mobj.group(1).decode('utf-8')
1015         video_uploader = mobj.group(2).decode('utf-8')
1016
1017         return [{
1018             'id':       video_id.decode('utf-8'),
1019             'url':      video_url.decode('utf-8'),
1020             'uploader': video_uploader,
1021             'upload_date':  None,
1022             'title':    video_title,
1023             'ext':      video_extension.decode('utf-8'),
1024         }]
1025
1026
1027 class YahooIE(InfoExtractor):
1028     """Information extractor for screen.yahoo.com."""
1029     _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1030
1031     def _real_extract(self, url):
1032         mobj = re.match(self._VALID_URL, url)
1033         if mobj is None:
1034             raise ExtractorError(u'Invalid URL: %s' % url)
1035         video_id = mobj.group('id')
1036         webpage = self._download_webpage(url, video_id)
1037         m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1038
1039         if m_id is None:
1040             # TODO: Check which url parameters are required
1041             info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1042             webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
1043             info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1044                         <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1045                         <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1046                         <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1047                         '''
1048             self.report_extraction(video_id)
1049             m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1050             if m_info is None:
1051                 raise ExtractorError(u'Unable to extract video info')
1052             video_title = m_info.group('title')
1053             video_description = m_info.group('description')
1054             video_thumb = m_info.group('thumb')
1055             video_date = m_info.group('date')
1056             video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1057
1058             # TODO: Find a way to get mp4 videos
1059             rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1060             webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1061             m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1062             video_url = m_rest.group('url')
1063             video_path = m_rest.group('path')
1064             if m_rest is None:
1065                 raise ExtractorError(u'Unable to extract video url')
1066
1067         else: # We have to use a different method if another id is defined
1068             long_id = m_id.group('new_id')
1069             info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1070             webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
1071             json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1072             info = json.loads(json_str)
1073             res = info[u'query'][u'results'][u'mediaObj'][0]
1074             stream = res[u'streams'][0]
1075             video_path = stream[u'path']
1076             video_url = stream[u'host']
1077             meta = res[u'meta']
1078             video_title = meta[u'title']
1079             video_description = meta[u'description']
1080             video_thumb = meta[u'thumbnail']
1081             video_date = None # I can't find it
1082
1083         info_dict = {
1084                      'id': video_id,
1085                      'url': video_url,
1086                      'play_path': video_path,
1087                      'title':video_title,
1088                      'description': video_description,
1089                      'thumbnail': video_thumb,
1090                      'upload_date': video_date,
1091                      'ext': 'flv',
1092                      }
1093         return info_dict
1094
1095 class VimeoIE(InfoExtractor):
1096     """Information extractor for vimeo.com."""
1097
1098     # _VALID_URL matches Vimeo URLs
1099     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1100     IE_NAME = u'vimeo'
1101
1102     def _real_extract(self, url, new_video=True):
1103         # Extract ID from URL
1104         mobj = re.match(self._VALID_URL, url)
1105         if mobj is None:
1106             raise ExtractorError(u'Invalid URL: %s' % url)
1107
1108         video_id = mobj.group('id')
1109         if not mobj.group('proto'):
1110             url = 'https://' + url
1111         if mobj.group('direct_link') or mobj.group('pro'):
1112             url = 'https://vimeo.com/' + video_id
1113
1114         # Retrieve video webpage to extract further information
1115         request = compat_urllib_request.Request(url, None, std_headers)
1116         webpage = self._download_webpage(request, video_id)
1117
1118         # Now we begin extracting as much information as we can from what we
1119         # retrieved. First we extract the information common to all extractors,
1120         # and latter we extract those that are Vimeo specific.
1121         self.report_extraction(video_id)
1122
1123         # Extract the config JSON
1124         try:
1125             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1126             config = json.loads(config)
1127         except:
1128             if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1129                 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1130             else:
1131                 raise ExtractorError(u'Unable to extract info section')
1132
1133         # Extract title
1134         video_title = config["video"]["title"]
1135
1136         # Extract uploader and uploader_id
1137         video_uploader = config["video"]["owner"]["name"]
1138         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1139
1140         # Extract video thumbnail
1141         video_thumbnail = config["video"]["thumbnail"]
1142
1143         # Extract video description
1144         video_description = get_element_by_attribute("itemprop", "description", webpage)
1145         if video_description: video_description = clean_html(video_description)
1146         else: video_description = u''
1147
1148         # Extract upload date
1149         video_upload_date = None
1150         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1151         if mobj is not None:
1152             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1153
1154         # Vimeo specific: extract request signature and timestamp
1155         sig = config['request']['signature']
1156         timestamp = config['request']['timestamp']
1157
1158         # Vimeo specific: extract video codec and quality information
1159         # First consider quality, then codecs, then take everything
1160         # TODO bind to format param
1161         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1162         files = { 'hd': [], 'sd': [], 'other': []}
1163         for codec_name, codec_extension in codecs:
1164             if codec_name in config["video"]["files"]:
1165                 if 'hd' in config["video"]["files"][codec_name]:
1166                     files['hd'].append((codec_name, codec_extension, 'hd'))
1167                 elif 'sd' in config["video"]["files"][codec_name]:
1168                     files['sd'].append((codec_name, codec_extension, 'sd'))
1169                 else:
1170                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1171
1172         for quality in ('hd', 'sd', 'other'):
1173             if len(files[quality]) > 0:
1174                 video_quality = files[quality][0][2]
1175                 video_codec = files[quality][0][0]
1176                 video_extension = files[quality][0][1]
1177                 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1178                 break
1179         else:
1180             raise ExtractorError(u'No known codec found')
1181
1182         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1183                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1184
1185         return [{
1186             'id':       video_id,
1187             'url':      video_url,
1188             'uploader': video_uploader,
1189             'uploader_id': video_uploader_id,
1190             'upload_date':  video_upload_date,
1191             'title':    video_title,
1192             'ext':      video_extension,
1193             'thumbnail':    video_thumbnail,
1194             'description':  video_description,
1195         }]
1196
1197
1198 class ArteTvIE(InfoExtractor):
1199     """arte.tv information extractor."""
1200
1201     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1202     _LIVE_URL = r'index-[0-9]+\.html$'
1203
1204     IE_NAME = u'arte.tv'
1205
1206     def fetch_webpage(self, url):
1207         request = compat_urllib_request.Request(url)
1208         try:
1209             self.report_download_webpage(url)
1210             webpage = compat_urllib_request.urlopen(request).read()
1211         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1212             raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1213         except ValueError as err:
1214             raise ExtractorError(u'Invalid URL: %s' % url)
1215         return webpage
1216
1217     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1218         page = self.fetch_webpage(url)
1219         mobj = re.search(regex, page, regexFlags)
1220         info = {}
1221
1222         if mobj is None:
1223             raise ExtractorError(u'Invalid URL: %s' % url)
1224
1225         for (i, key, err) in matchTuples:
1226             if mobj.group(i) is None:
1227                 raise ExtractorError(err)
1228             else:
1229                 info[key] = mobj.group(i)
1230
1231         return info
1232
1233     def extractLiveStream(self, url):
1234         video_lang = url.split('/')[-4]
1235         info = self.grep_webpage(
1236             url,
1237             r'src="(.*?/videothek_js.*?\.js)',
1238             0,
1239             [
1240                 (1, 'url', u'Invalid URL: %s' % url)
1241             ]
1242         )
1243         http_host = url.split('/')[2]
1244         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1245         info = self.grep_webpage(
1246             next_url,
1247             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1248                 '(http://.*?\.swf).*?' +
1249                 '(rtmp://.*?)\'',
1250             re.DOTALL,
1251             [
1252                 (1, 'path',   u'could not extract video path: %s' % url),
1253                 (2, 'player', u'could not extract video player: %s' % url),
1254                 (3, 'url',    u'could not extract video url: %s' % url)
1255             ]
1256         )
1257         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1258
1259     def extractPlus7Stream(self, url):
1260         video_lang = url.split('/')[-3]
1261         info = self.grep_webpage(
1262             url,
1263             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1264             0,
1265             [
1266                 (1, 'url', u'Invalid URL: %s' % url)
1267             ]
1268         )
1269         next_url = compat_urllib_parse.unquote(info.get('url'))
1270         info = self.grep_webpage(
1271             next_url,
1272             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1273             0,
1274             [
1275                 (1, 'url', u'Could not find <video> tag: %s' % url)
1276             ]
1277         )
1278         next_url = compat_urllib_parse.unquote(info.get('url'))
1279
1280         info = self.grep_webpage(
1281             next_url,
1282             r'<video id="(.*?)".*?>.*?' +
1283                 '<name>(.*?)</name>.*?' +
1284                 '<dateVideo>(.*?)</dateVideo>.*?' +
1285                 '<url quality="hd">(.*?)</url>',
1286             re.DOTALL,
1287             [
1288                 (1, 'id',    u'could not extract video id: %s' % url),
1289                 (2, 'title', u'could not extract video title: %s' % url),
1290                 (3, 'date',  u'could not extract video date: %s' % url),
1291                 (4, 'url',   u'could not extract video url: %s' % url)
1292             ]
1293         )
1294
1295         return {
1296             'id':           info.get('id'),
1297             'url':          compat_urllib_parse.unquote(info.get('url')),
1298             'uploader':     u'arte.tv',
1299             'upload_date':  unified_strdate(info.get('date')),
1300             'title':        info.get('title').decode('utf-8'),
1301             'ext':          u'mp4',
1302             'format':       u'NA',
1303             'player_url':   None,
1304         }
1305
1306     def _real_extract(self, url):
1307         video_id = url.split('/')[-1]
1308         self.report_extraction(video_id)
1309
1310         if re.search(self._LIVE_URL, video_id) is not None:
1311             self.extractLiveStream(url)
1312             return
1313         else:
1314             info = self.extractPlus7Stream(url)
1315
1316         return [info]
1317
1318
1319 class GenericIE(InfoExtractor):
1320     """Generic last-resort information extractor."""
1321
1322     _VALID_URL = r'.*'
1323     IE_NAME = u'generic'
1324
1325     def report_download_webpage(self, video_id):
1326         """Report webpage download."""
1327         if not self._downloader.params.get('test', False):
1328             self._downloader.report_warning(u'Falling back on generic information extractor.')
1329         super(GenericIE, self).report_download_webpage(video_id)
1330
1331     def report_following_redirect(self, new_url):
1332         """Report information extraction."""
1333         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1334
1335     def _test_redirect(self, url):
1336         """Check if it is a redirect, like url shorteners, in case return the new url."""
1337         class HeadRequest(compat_urllib_request.Request):
1338             def get_method(self):
1339                 return "HEAD"
1340
1341         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1342             """
1343             Subclass the HTTPRedirectHandler to make it use our
1344             HeadRequest also on the redirected URL
1345             """
1346             def redirect_request(self, req, fp, code, msg, headers, newurl):
1347                 if code in (301, 302, 303, 307):
1348                     newurl = newurl.replace(' ', '%20')
1349                     newheaders = dict((k,v) for k,v in req.headers.items()
1350                                       if k.lower() not in ("content-length", "content-type"))
1351                     return HeadRequest(newurl,
1352                                        headers=newheaders,
1353                                        origin_req_host=req.get_origin_req_host(),
1354                                        unverifiable=True)
1355                 else:
1356                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1357
1358         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1359             """
1360             Fallback to GET if HEAD is not allowed (405 HTTP error)
1361             """
1362             def http_error_405(self, req, fp, code, msg, headers):
1363                 fp.read()
1364                 fp.close()
1365
1366                 newheaders = dict((k,v) for k,v in req.headers.items()
1367                                   if k.lower() not in ("content-length", "content-type"))
1368                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1369                                                  headers=newheaders,
1370                                                  origin_req_host=req.get_origin_req_host(),
1371                                                  unverifiable=True))
1372
1373         # Build our opener
1374         opener = compat_urllib_request.OpenerDirector()
1375         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1376                         HTTPMethodFallback, HEADRedirectHandler,
1377                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1378             opener.add_handler(handler())
1379
1380         response = opener.open(HeadRequest(url))
1381         if response is None:
1382             raise ExtractorError(u'Invalid URL protocol')
1383         new_url = response.geturl()
1384
1385         if url == new_url:
1386             return False
1387
1388         self.report_following_redirect(new_url)
1389         return new_url
1390
1391     def _real_extract(self, url):
1392         new_url = self._test_redirect(url)
1393         if new_url: return [self.url_result(new_url)]
1394
1395         video_id = url.split('/')[-1]
1396         try:
1397             webpage = self._download_webpage(url, video_id)
1398         except ValueError as err:
1399             # since this is the last-resort InfoExtractor, if
1400             # this error is thrown, it'll be thrown here
1401             raise ExtractorError(u'Invalid URL: %s' % url)
1402
1403         self.report_extraction(video_id)
1404         # Start with something easy: JW Player in SWFObject
1405         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1406         if mobj is None:
1407             # Broaden the search a little bit
1408             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1409         if mobj is None:
1410             # Broaden the search a little bit: JWPlayer JS loader
1411             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1412         if mobj is None:
1413             # Try to find twitter cards info
1414             mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1415         if mobj is None:
1416             raise ExtractorError(u'Invalid URL: %s' % url)
1417
1418         # It's possible that one of the regexes
1419         # matched, but returned an empty group:
1420         if mobj.group(1) is None:
1421             raise ExtractorError(u'Invalid URL: %s' % url)
1422
1423         video_url = compat_urllib_parse.unquote(mobj.group(1))
1424         video_id = os.path.basename(video_url)
1425
1426         # here's a fun little line of code for you:
1427         video_extension = os.path.splitext(video_id)[1][1:]
1428         video_id = os.path.splitext(video_id)[0]
1429
1430         # it's tempting to parse this further, but you would
1431         # have to take into account all the variations like
1432         #   Video Title - Site Name
1433         #   Site Name | Video Title
1434         #   Video Title - Tagline | Site Name
1435         # and so on and so forth; it's just not practical
1436         video_title = self._html_search_regex(r'<title>(.*)</title>',
1437             webpage, u'video title')
1438
1439         # video uploader is domain name
1440         video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1441             url, u'video uploader')
1442
1443         return [{
1444             'id':       video_id,
1445             'url':      video_url,
1446             'uploader': video_uploader,
1447             'upload_date':  None,
1448             'title':    video_title,
1449             'ext':      video_extension,
1450         }]
1451
1452
1453 class YoutubeSearchIE(SearchInfoExtractor):
1454     """Information Extractor for YouTube search queries."""
1455     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1456     _MAX_RESULTS = 1000
1457     IE_NAME = u'youtube:search'
1458     _SEARCH_KEY = 'ytsearch'
1459
1460     def report_download_page(self, query, pagenum):
1461         """Report attempt to download search page with given number."""
1462         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1463
1464     def _get_n_results(self, query, n):
1465         """Get a specified number of results for a query"""
1466
1467         video_ids = []
1468         pagenum = 0
1469         limit = n
1470
1471         while (50 * pagenum) < limit:
1472             self.report_download_page(query, pagenum+1)
1473             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1474             request = compat_urllib_request.Request(result_url)
1475             try:
1476                 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1477             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1478                 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1479             api_response = json.loads(data)['data']
1480
1481             if not 'items' in api_response:
1482                 raise ExtractorError(u'[youtube] No video results')
1483
1484             new_ids = list(video['id'] for video in api_response['items'])
1485             video_ids += new_ids
1486
1487             limit = min(n, api_response['totalItems'])
1488             pagenum += 1
1489
1490         if len(video_ids) > n:
1491             video_ids = video_ids[:n]
1492         videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1493         return self.playlist_result(videos, query)
1494
1495
1496 class GoogleSearchIE(SearchInfoExtractor):
1497     """Information Extractor for Google Video search queries."""
1498     _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1499     _MAX_RESULTS = 1000
1500     IE_NAME = u'video.google:search'
1501     _SEARCH_KEY = 'gvsearch'
1502
1503     def _get_n_results(self, query, n):
1504         """Get a specified number of results for a query"""
1505
1506         res = {
1507             '_type': 'playlist',
1508             'id': query,
1509             'entries': []
1510         }
1511
1512         for pagenum in itertools.count(1):
1513             result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1514             webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1515                                              note='Downloading result page ' + str(pagenum))
1516
1517             for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1518                 e = {
1519                     '_type': 'url',
1520                     'url': mobj.group(1)
1521                 }
1522                 res['entries'].append(e)
1523
1524             if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1525                 return res
1526
1527 class YahooSearchIE(SearchInfoExtractor):
1528     """Information Extractor for Yahoo! Video search queries."""
1529
1530     _MAX_RESULTS = 1000
1531     IE_NAME = u'screen.yahoo:search'
1532     _SEARCH_KEY = 'yvsearch'
1533
1534     def _get_n_results(self, query, n):
1535         """Get a specified number of results for a query"""
1536
1537         res = {
1538             '_type': 'playlist',
1539             'id': query,
1540             'entries': []
1541         }
1542         for pagenum in itertools.count(0):
1543             result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1544             webpage = self._download_webpage(result_url, query,
1545                                              note='Downloading results page '+str(pagenum+1))
1546             info = json.loads(webpage)
1547             m = info[u'm']
1548             results = info[u'results']
1549
1550             for (i, r) in enumerate(results):
1551                 if (pagenum * 30) +i >= n:
1552                     break
1553                 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1554                 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1555                 res['entries'].append(e)
1556             if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1557                 break
1558
1559         return res
1560
1561
1562 class YoutubePlaylistIE(InfoExtractor):
1563     """Information Extractor for YouTube playlists."""
1564
1565     _VALID_URL = r"""(?:
1566                         (?:https?://)?
1567                         (?:\w+\.)?
1568                         youtube\.com/
1569                         (?:
1570                            (?:course|view_play_list|my_playlists|artist|playlist|watch)
1571                            \? (?:.*?&)*? (?:p|a|list)=
1572                         |  p/
1573                         )
1574                         ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1575                         .*
1576                      |
1577                         ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1578                      )"""
1579     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1580     _MAX_RESULTS = 50
1581     IE_NAME = u'youtube:playlist'
1582
1583     @classmethod
1584     def suitable(cls, url):
1585         """Receives a URL and returns True if suitable for this IE."""
1586         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1587
1588     def _real_extract(self, url):
1589         # Extract playlist id
1590         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1591         if mobj is None:
1592             raise ExtractorError(u'Invalid URL: %s' % url)
1593
1594         # Download playlist videos from API
1595         playlist_id = mobj.group(1) or mobj.group(2)
1596         page_num = 1
1597         videos = []
1598
1599         while True:
1600             url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1601             page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1602
1603             try:
1604                 response = json.loads(page)
1605             except ValueError as err:
1606                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1607
1608             if 'feed' not in response:
1609                 raise ExtractorError(u'Got a malformed response from YouTube API')
1610             playlist_title = response['feed']['title']['$t']
1611             if 'entry' not in response['feed']:
1612                 # Number of videos is a multiple of self._MAX_RESULTS
1613                 break
1614
1615             videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1616                         for entry in response['feed']['entry']
1617                         if 'content' in entry ]
1618
1619             if len(response['feed']['entry']) < self._MAX_RESULTS:
1620                 break
1621             page_num += 1
1622
1623         videos = [v[1] for v in sorted(videos)]
1624
1625         url_results = [self.url_result(url, 'Youtube') for url in videos]
1626         return [self.playlist_result(url_results, playlist_id, playlist_title)]
1627
1628
1629 class YoutubeChannelIE(InfoExtractor):
1630     """Information Extractor for YouTube channels."""
1631
1632     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1633     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1634     _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1635     _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1636     IE_NAME = u'youtube:channel'
1637
1638     def extract_videos_from_page(self, page):
1639         ids_in_page = []
1640         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1641             if mobj.group(1) not in ids_in_page:
1642                 ids_in_page.append(mobj.group(1))
1643         return ids_in_page
1644
1645     def _real_extract(self, url):
1646         # Extract channel id
1647         mobj = re.match(self._VALID_URL, url)
1648         if mobj is None:
1649             raise ExtractorError(u'Invalid URL: %s' % url)
1650
1651         # Download channel page
1652         channel_id = mobj.group(1)
1653         video_ids = []
1654         pagenum = 1
1655
1656         url = self._TEMPLATE_URL % (channel_id, pagenum)
1657         page = self._download_webpage(url, channel_id,
1658                                       u'Downloading page #%s' % pagenum)
1659
1660         # Extract video identifiers
1661         ids_in_page = self.extract_videos_from_page(page)
1662         video_ids.extend(ids_in_page)
1663
1664         # Download any subsequent channel pages using the json-based channel_ajax query
1665         if self._MORE_PAGES_INDICATOR in page:
1666             while True:
1667                 pagenum = pagenum + 1
1668
1669                 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1670                 page = self._download_webpage(url, channel_id,
1671                                               u'Downloading page #%s' % pagenum)
1672
1673                 page = json.loads(page)
1674
1675                 ids_in_page = self.extract_videos_from_page(page['content_html'])
1676                 video_ids.extend(ids_in_page)
1677
1678                 if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']:
1679                     break
1680
1681         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1682
1683         urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1684         url_entries = [self.url_result(url, 'Youtube') for url in urls]
1685         return [self.playlist_result(url_entries, channel_id)]
1686
1687
1688 class YoutubeUserIE(InfoExtractor):
1689     """Information Extractor for YouTube users."""
1690
1691     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1692     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1693     _GDATA_PAGE_SIZE = 50
1694     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1695     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1696     IE_NAME = u'youtube:user'
1697
1698     def _real_extract(self, url):
1699         # Extract username
1700         mobj = re.match(self._VALID_URL, url)
1701         if mobj is None:
1702             raise ExtractorError(u'Invalid URL: %s' % url)
1703
1704         username = mobj.group(1)
1705
1706         # Download video ids using YouTube Data API. Result size per
1707         # query is limited (currently to 50 videos) so we need to query
1708         # page by page until there are no video ids - it means we got
1709         # all of them.
1710
1711         video_ids = []
1712         pagenum = 0
1713
1714         while True:
1715             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1716
1717             gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1718             page = self._download_webpage(gdata_url, username,
1719                                           u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1720
1721             # Extract video identifiers
1722             ids_in_page = []
1723
1724             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1725                 if mobj.group(1) not in ids_in_page:
1726                     ids_in_page.append(mobj.group(1))
1727
1728             video_ids.extend(ids_in_page)
1729
1730             # A little optimization - if current page is not
1731             # "full", ie. does not contain PAGE_SIZE video ids then
1732             # we can assume that this page is the last one - there
1733             # are no more ids on further pages - no need to query
1734             # again.
1735
1736             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1737                 break
1738
1739             pagenum += 1
1740
1741         urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1742         url_results = [self.url_result(url, 'Youtube') for url in urls]
1743         return [self.playlist_result(url_results, playlist_title = username)]
1744
1745
1746 class BlipTVUserIE(InfoExtractor):
1747     """Information Extractor for blip.tv users."""
1748
1749     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1750     _PAGE_SIZE = 12
1751     IE_NAME = u'blip.tv:user'
1752
1753     def _real_extract(self, url):
1754         # Extract username
1755         mobj = re.match(self._VALID_URL, url)
1756         if mobj is None:
1757             raise ExtractorError(u'Invalid URL: %s' % url)
1758
1759         username = mobj.group(1)
1760
1761         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1762
1763         page = self._download_webpage(url, username, u'Downloading user page')
1764         mobj = re.search(r'data-users-id="([^"]+)"', page)
1765         page_base = page_base % mobj.group(1)
1766
1767
1768         # Download video ids using BlipTV Ajax calls. Result size per
1769         # query is limited (currently to 12 videos) so we need to query
1770         # page by page until there are no video ids - it means we got
1771         # all of them.
1772
1773         video_ids = []
1774         pagenum = 1
1775
1776         while True:
1777             url = page_base + "&page=" + str(pagenum)
1778             page = self._download_webpage(url, username,
1779                                           u'Downloading video ids from page %d' % pagenum)
1780
1781             # Extract video identifiers
1782             ids_in_page = []
1783
1784             for mobj in re.finditer(r'href="/([^"]+)"', page):
1785                 if mobj.group(1) not in ids_in_page:
1786                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1787
1788             video_ids.extend(ids_in_page)
1789
1790             # A little optimization - if current page is not
1791             # "full", ie. does not contain PAGE_SIZE video ids then
1792             # we can assume that this page is the last one - there
1793             # are no more ids on further pages - no need to query
1794             # again.
1795
1796             if len(ids_in_page) < self._PAGE_SIZE:
1797                 break
1798
1799             pagenum += 1
1800
1801         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1802         url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1803         return [self.playlist_result(url_entries, playlist_title = username)]
1804
1805
1806 class DepositFilesIE(InfoExtractor):
1807     """Information extractor for depositfiles.com"""
1808
1809     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1810
1811     def _real_extract(self, url):
1812         file_id = url.split('/')[-1]
1813         # Rebuild url in english locale
1814         url = 'http://depositfiles.com/en/files/' + file_id
1815
1816         # Retrieve file webpage with 'Free download' button pressed
1817         free_download_indication = { 'gateway_result' : '1' }
1818         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1819         try:
1820             self.report_download_webpage(file_id)
1821             webpage = compat_urllib_request.urlopen(request).read()
1822         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1823             raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1824
1825         # Search for the real file URL
1826         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1827         if (mobj is None) or (mobj.group(1) is None):
1828             # Try to figure out reason of the error.
1829             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1830             if (mobj is not None) and (mobj.group(1) is not None):
1831                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1832                 raise ExtractorError(u'%s' % restriction_message)
1833             else:
1834                 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1835
1836         file_url = mobj.group(1)
1837         file_extension = os.path.splitext(file_url)[1][1:]
1838
1839         # Search for file title
1840         file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
1841
1842         return [{
1843             'id':       file_id.decode('utf-8'),
1844             'url':      file_url.decode('utf-8'),
1845             'uploader': None,
1846             'upload_date':  None,
1847             'title':    file_title,
1848             'ext':      file_extension.decode('utf-8'),
1849         }]
1850
1851
1852 class FacebookIE(InfoExtractor):
1853     """Information Extractor for Facebook"""
1854
1855     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1856     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1857     _NETRC_MACHINE = 'facebook'
1858     IE_NAME = u'facebook'
1859
1860     def report_login(self):
1861         """Report attempt to log in."""
1862         self.to_screen(u'Logging in')
1863
1864     def _real_initialize(self):
1865         if self._downloader is None:
1866             return
1867
1868         useremail = None
1869         password = None
1870         downloader_params = self._downloader.params
1871
1872         # Attempt to use provided username and password or .netrc data
1873         if downloader_params.get('username', None) is not None:
1874             useremail = downloader_params['username']
1875             password = downloader_params['password']
1876         elif downloader_params.get('usenetrc', False):
1877             try:
1878                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1879                 if info is not None:
1880                     useremail = info[0]
1881                     password = info[2]
1882                 else:
1883                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1884             except (IOError, netrc.NetrcParseError) as err:
1885                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1886                 return
1887
1888         if useremail is None:
1889             return
1890
1891         # Log in
1892         login_form = {
1893             'email': useremail,
1894             'pass': password,
1895             'login': 'Log+In'
1896             }
1897         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1898         try:
1899             self.report_login()
1900             login_results = compat_urllib_request.urlopen(request).read()
1901             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1902                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1903                 return
1904         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1905             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1906             return
1907
1908     def _real_extract(self, url):
1909         mobj = re.match(self._VALID_URL, url)
1910         if mobj is None:
1911             raise ExtractorError(u'Invalid URL: %s' % url)
1912         video_id = mobj.group('ID')
1913
1914         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1915         webpage = self._download_webpage(url, video_id)
1916
1917         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1918         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1919         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1920         if not m:
1921             raise ExtractorError(u'Cannot parse data')
1922         data = dict(json.loads(m.group(1)))
1923         params_raw = compat_urllib_parse.unquote(data['params'])
1924         params = json.loads(params_raw)
1925         video_data = params['video_data'][0]
1926         video_url = video_data.get('hd_src')
1927         if not video_url:
1928             video_url = video_data['sd_src']
1929         if not video_url:
1930             raise ExtractorError(u'Cannot find video URL')
1931         video_duration = int(video_data['video_duration'])
1932         thumbnail = video_data['thumbnail_src']
1933
1934         video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1935             webpage, u'title')
1936
1937         info = {
1938             'id': video_id,
1939             'title': video_title,
1940             'url': video_url,
1941             'ext': 'mp4',
1942             'duration': video_duration,
1943             'thumbnail': thumbnail,
1944         }
1945         return [info]
1946
1947
1948 class BlipTVIE(InfoExtractor):
1949     """Information extractor for blip.tv"""
1950
1951     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
1952     _URL_EXT = r'^.*\.([a-z0-9]+)$'
1953     IE_NAME = u'blip.tv'
1954
1955     def report_direct_download(self, title):
1956         """Report information extraction."""
1957         self.to_screen(u'%s: Direct download detected' % title)
1958
1959     def _real_extract(self, url):
1960         mobj = re.match(self._VALID_URL, url)
1961         if mobj is None:
1962             raise ExtractorError(u'Invalid URL: %s' % url)
1963
1964         # See https://github.com/rg3/youtube-dl/issues/857
1965         api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1966         if api_mobj is not None:
1967             url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1968         urlp = compat_urllib_parse_urlparse(url)
1969         if urlp.path.startswith('/play/'):
1970             request = compat_urllib_request.Request(url)
1971             response = compat_urllib_request.urlopen(request)
1972             redirecturl = response.geturl()
1973             rurlp = compat_urllib_parse_urlparse(redirecturl)
1974             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1975             url = 'http://blip.tv/a/a-' + file_id
1976             return self._real_extract(url)
1977
1978
1979         if '?' in url:
1980             cchar = '&'
1981         else:
1982             cchar = '?'
1983         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1984         request = compat_urllib_request.Request(json_url)
1985         request.add_header('User-Agent', 'iTunes/10.6.1')
1986         self.report_extraction(mobj.group(1))
1987         info = None
1988         try:
1989             urlh = compat_urllib_request.urlopen(request)
1990             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1991                 basename = url.split('/')[-1]
1992                 title,ext = os.path.splitext(basename)
1993                 title = title.decode('UTF-8')
1994                 ext = ext.replace('.', '')
1995                 self.report_direct_download(title)
1996                 info = {
1997                     'id': title,
1998                     'url': url,
1999                     'uploader': None,
2000                     'upload_date': None,
2001                     'title': title,
2002                     'ext': ext,
2003                     'urlhandle': urlh
2004                 }
2005         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2006             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2007         if info is None: # Regular URL
2008             try:
2009                 json_code_bytes = urlh.read()
2010                 json_code = json_code_bytes.decode('utf-8')
2011             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2012                 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2013
2014             try:
2015                 json_data = json.loads(json_code)
2016                 if 'Post' in json_data:
2017                     data = json_data['Post']
2018                 else:
2019                     data = json_data
2020
2021                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2022                 video_url = data['media']['url']
2023                 umobj = re.match(self._URL_EXT, video_url)
2024                 if umobj is None:
2025                     raise ValueError('Can not determine filename extension')
2026                 ext = umobj.group(1)
2027
2028                 info = {
2029                     'id': data['item_id'],
2030                     'url': video_url,
2031                     'uploader': data['display_name'],
2032                     'upload_date': upload_date,
2033                     'title': data['title'],
2034                     'ext': ext,
2035                     'format': data['media']['mimeType'],
2036                     'thumbnail': data['thumbnailUrl'],
2037                     'description': data['description'],
2038                     'player_url': data['embedUrl'],
2039                     'user_agent': 'iTunes/10.6.1',
2040                 }
2041             except (ValueError,KeyError) as err:
2042                 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
2043
2044         return [info]
2045
2046
2047 class MyVideoIE(InfoExtractor):
2048     """Information Extractor for myvideo.de."""
2049
2050     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2051     IE_NAME = u'myvideo'
2052
2053     # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2054     # Released into the Public Domain by Tristan Fischer on 2013-05-19
2055     # https://github.com/rg3/youtube-dl/pull/842
2056     def __rc4crypt(self,data, key):
2057         x = 0
2058         box = list(range(256))
2059         for i in list(range(256)):
2060             x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2061             box[i], box[x] = box[x], box[i]
2062         x = 0
2063         y = 0
2064         out = ''
2065         for char in data:
2066             x = (x + 1) % 256
2067             y = (y + box[x]) % 256
2068             box[x], box[y] = box[y], box[x]
2069             out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
2070         return out
2071
2072     def __md5(self,s):
2073         return hashlib.md5(s).hexdigest().encode()
2074
2075     def _real_extract(self,url):
2076         mobj = re.match(self._VALID_URL, url)
2077         if mobj is None:
2078             raise ExtractorError(u'invalid URL: %s' % url)
2079
2080         video_id = mobj.group(1)
2081
2082         GK = (
2083           b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2084           b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2085           b'TnpsbA0KTVRkbU1tSTRNdz09'
2086         )
2087
2088         # Get video webpage
2089         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2090         webpage = self._download_webpage(webpage_url, video_id)
2091
2092         mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2093         if mobj is not None:
2094             self.report_extraction(video_id)
2095             video_url = mobj.group(1) + '.flv'
2096
2097             video_title = self._html_search_regex('<title>([^<]+)</title>',
2098                 webpage, u'title')
2099
2100             video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2101
2102             return [{
2103                 'id':       video_id,
2104                 'url':      video_url,
2105                 'uploader': None,
2106                 'upload_date':  None,
2107                 'title':    video_title,
2108                 'ext':      u'flv',
2109             }]
2110
2111         # try encxml
2112         mobj = re.search('var flashvars={(.+?)}', webpage)
2113         if mobj is None:
2114             raise ExtractorError(u'Unable to extract video')
2115
2116         params = {}
2117         encxml = ''
2118         sec = mobj.group(1)
2119         for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2120             if not a == '_encxml':
2121                 params[a] = b
2122             else:
2123                 encxml = compat_urllib_parse.unquote(b)
2124         if not params.get('domain'):
2125             params['domain'] = 'www.myvideo.de'
2126         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2127         if 'flash_playertype=MTV' in xmldata_url:
2128             self._downloader.report_warning(u'avoiding MTV player')
2129             xmldata_url = (
2130                 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2131                 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
2132             ) % video_id
2133
2134         # get enc data
2135         enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2136         enc_data_b = binascii.unhexlify(enc_data)
2137         sk = self.__md5(
2138             base64.b64decode(base64.b64decode(GK)) +
2139             self.__md5(
2140                 str(video_id).encode('utf-8')
2141             )
2142         )
2143         dec_data = self.__rc4crypt(enc_data_b, sk)
2144
2145         # extracting infos
2146         self.report_extraction(video_id)
2147
2148         video_url = None
2149         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2150         if mobj:
2151             video_url = compat_urllib_parse.unquote(mobj.group(1))
2152             if 'myvideo2flash' in video_url:
2153                 self._downloader.report_warning(u'forcing RTMPT ...')
2154                 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2155
2156         if not video_url:
2157             # extract non rtmp videos
2158             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2159             if mobj is None:
2160                 raise ExtractorError(u'unable to extract url')
2161             video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2162
2163         video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2164         video_file = compat_urllib_parse.unquote(video_file)
2165
2166         if not video_file.endswith('f4m'):
2167             ppath, prefix = video_file.split('.')
2168             video_playpath = '%s:%s' % (prefix, ppath)
2169             video_hls_playlist = ''
2170         else:
2171             video_playpath = ''
2172             video_hls_playlist = (
2173                 video_filepath + video_file
2174             ).replace('.f4m', '.m3u8')
2175
2176         video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2177         video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2178
2179         video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2180             webpage, u'title')
2181
2182         return [{
2183             'id':                 video_id,
2184             'url':                video_url,
2185             'tc_url':             video_url,
2186             'uploader':           None,
2187             'upload_date':        None,
2188             'title':              video_title,
2189             'ext':                u'flv',
2190             'play_path':          video_playpath,
2191             'video_file':         video_file,
2192             'video_hls_playlist': video_hls_playlist,
2193             'player_url':         video_swfobj,
2194         }]
2195
2196
2197 class ComedyCentralIE(InfoExtractor):
2198     """Information extractor for The Daily Show and Colbert Report """
2199
2200     # urls can be abbreviations like :thedailyshow or :colbert
2201     # urls for episodes like:
2202     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2203     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2204     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2205     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2206                       |(https?://)?(www\.)?
2207                           (?P<showname>thedailyshow|colbertnation)\.com/
2208                          (full-episodes/(?P<episode>.*)|
2209                           (?P<clip>
2210                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2211                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2212                      $"""
2213
2214     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2215
2216     _video_extensions = {
2217         '3500': 'mp4',
2218         '2200': 'mp4',
2219         '1700': 'mp4',
2220         '1200': 'mp4',
2221         '750': 'mp4',
2222         '400': 'mp4',
2223     }
2224     _video_dimensions = {
2225         '3500': '1280x720',
2226         '2200': '960x540',
2227         '1700': '768x432',
2228         '1200': '640x360',
2229         '750': '512x288',
2230         '400': '384x216',
2231     }
2232
2233     @classmethod
2234     def suitable(cls, url):
2235         """Receives a URL and returns True if suitable for this IE."""
2236         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2237
2238     def _print_formats(self, formats):
2239         print('Available formats:')
2240         for x in formats:
2241             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2242
2243
2244     def _real_extract(self, url):
2245         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2246         if mobj is None:
2247             raise ExtractorError(u'Invalid URL: %s' % url)
2248
2249         if mobj.group('shortname'):
2250             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2251                 url = u'http://www.thedailyshow.com/full-episodes/'
2252             else:
2253                 url = u'http://www.colbertnation.com/full-episodes/'
2254             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2255             assert mobj is not None
2256
2257         if mobj.group('clip'):
2258             if mobj.group('showname') == 'thedailyshow':
2259                 epTitle = mobj.group('tdstitle')
2260             else:
2261                 epTitle = mobj.group('cntitle')
2262             dlNewest = False
2263         else:
2264             dlNewest = not mobj.group('episode')
2265             if dlNewest:
2266                 epTitle = mobj.group('showname')
2267             else:
2268                 epTitle = mobj.group('episode')
2269
2270         self.report_extraction(epTitle)
2271         webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2272         if dlNewest:
2273             url = htmlHandle.geturl()
2274             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2275             if mobj is None:
2276                 raise ExtractorError(u'Invalid redirected URL: ' + url)
2277             if mobj.group('episode') == '':
2278                 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2279             epTitle = mobj.group('episode')
2280
2281         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2282
2283         if len(mMovieParams) == 0:
2284             # The Colbert Report embeds the information in a without
2285             # a URL prefix; so extract the alternate reference
2286             # and then add the URL prefix manually.
2287
2288             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2289             if len(altMovieParams) == 0:
2290                 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2291             else:
2292                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2293
2294         uri = mMovieParams[0][1]
2295         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2296         indexXml = self._download_webpage(indexUrl, epTitle,
2297                                           u'Downloading show index',
2298                                           u'unable to download episode index')
2299
2300         results = []
2301
2302         idoc = xml.etree.ElementTree.fromstring(indexXml)
2303         itemEls = idoc.findall('.//item')
2304         for partNum,itemEl in enumerate(itemEls):
2305             mediaId = itemEl.findall('./guid')[0].text
2306             shortMediaId = mediaId.split(':')[-1]
2307             showId = mediaId.split(':')[-2].replace('.com', '')
2308             officialTitle = itemEl.findall('./title')[0].text
2309             officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2310
2311             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2312                         compat_urllib_parse.urlencode({'uri': mediaId}))
2313             configXml = self._download_webpage(configUrl, epTitle,
2314                                                u'Downloading configuration for %s' % shortMediaId)
2315
2316             cdoc = xml.etree.ElementTree.fromstring(configXml)
2317             turls = []
2318             for rendition in cdoc.findall('.//rendition'):
2319                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2320                 turls.append(finfo)
2321
2322             if len(turls) == 0:
2323                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2324                 continue
2325
2326             if self._downloader.params.get('listformats', None):
2327                 self._print_formats([i[0] for i in turls])
2328                 return
2329
2330             # For now, just pick the highest bitrate
2331             format,rtmp_video_url = turls[-1]
2332
2333             # Get the format arg from the arg stream
2334             req_format = self._downloader.params.get('format', None)
2335
2336             # Select format if we can find one
2337             for f,v in turls:
2338                 if f == req_format:
2339                     format, rtmp_video_url = f, v
2340                     break
2341
2342             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2343             if not m:
2344                 raise ExtractorError(u'Cannot transform RTMP url')
2345             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2346             video_url = base + m.group('finalid')
2347
2348             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2349             info = {
2350                 'id': shortMediaId,
2351                 'url': video_url,
2352                 'uploader': showId,
2353                 'upload_date': officialDate,
2354                 'title': effTitle,
2355                 'ext': 'mp4',
2356                 'format': format,
2357                 'thumbnail': None,
2358                 'description': officialTitle,
2359             }
2360             results.append(info)
2361
2362         return results
2363
2364
2365 class EscapistIE(InfoExtractor):
2366     """Information extractor for The Escapist """
2367
2368     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2369     IE_NAME = u'escapist'
2370
2371     def _real_extract(self, url):
2372         mobj = re.match(self._VALID_URL, url)
2373         if mobj is None:
2374             raise ExtractorError(u'Invalid URL: %s' % url)
2375         showName = mobj.group('showname')
2376         videoId = mobj.group('episode')
2377
2378         self.report_extraction(videoId)
2379         webpage = self._download_webpage(url, videoId)
2380
2381         videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2382             webpage, u'description', fatal=False)
2383
2384         imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2385             webpage, u'thumbnail', fatal=False)
2386
2387         playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2388             webpage, u'player url')
2389
2390         title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2391             webpage, u'player url').split(' : ')[-1]
2392
2393         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2394         configUrl = compat_urllib_parse.unquote(configUrl)
2395
2396         configJSON = self._download_webpage(configUrl, videoId,
2397                                             u'Downloading configuration',
2398                                             u'unable to download configuration')
2399
2400         # Technically, it's JavaScript, not JSON
2401         configJSON = configJSON.replace("'", '"')
2402
2403         try:
2404             config = json.loads(configJSON)
2405         except (ValueError,) as err:
2406             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2407
2408         playlist = config['playlist']
2409         videoUrl = playlist[1]['url']
2410
2411         info = {
2412             'id': videoId,
2413             'url': videoUrl,
2414             'uploader': showName,
2415             'upload_date': None,
2416             'title': title,
2417             'ext': 'mp4',
2418             'thumbnail': imgUrl,
2419             'description': videoDesc,
2420             'player_url': playerUrl,
2421         }
2422
2423         return [info]
2424
2425 class CollegeHumorIE(InfoExtractor):
2426     """Information extractor for collegehumor.com"""
2427
2428     _WORKING = False
2429     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2430     IE_NAME = u'collegehumor'
2431
2432     def report_manifest(self, video_id):
2433         """Report information extraction."""
2434         self.to_screen(u'%s: Downloading XML manifest' % video_id)
2435
2436     def _real_extract(self, url):
2437         mobj = re.match(self._VALID_URL, url)
2438         if mobj is None:
2439             raise ExtractorError(u'Invalid URL: %s' % url)
2440         video_id = mobj.group('videoid')
2441
2442         info = {
2443             'id': video_id,
2444             'uploader': None,
2445             'upload_date': None,
2446         }
2447
2448         self.report_extraction(video_id)
2449         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2450         try:
2451             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2452         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2453             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2454
2455         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2456         try:
2457             videoNode = mdoc.findall('./video')[0]
2458             info['description'] = videoNode.findall('./description')[0].text
2459             info['title'] = videoNode.findall('./caption')[0].text
2460             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2461             manifest_url = videoNode.findall('./file')[0].text
2462         except IndexError:
2463             raise ExtractorError(u'Invalid metadata XML file')
2464
2465         manifest_url += '?hdcore=2.10.3'
2466         self.report_manifest(video_id)
2467         try:
2468             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2469         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2470             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2471
2472         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2473         try:
2474             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2475             node_id = media_node.attrib['url']
2476             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2477         except IndexError as err:
2478             raise ExtractorError(u'Invalid manifest file')
2479
2480         url_pr = compat_urllib_parse_urlparse(manifest_url)
2481         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2482
2483         info['url'] = url
2484         info['ext'] = 'f4f'
2485         return [info]
2486
2487
2488 class XVideosIE(InfoExtractor):
2489     """Information extractor for xvideos.com"""
2490
2491     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2492     IE_NAME = u'xvideos'
2493
2494     def _real_extract(self, url):
2495         mobj = re.match(self._VALID_URL, url)
2496         if mobj is None:
2497             raise ExtractorError(u'Invalid URL: %s' % url)
2498         video_id = mobj.group(1)
2499
2500         webpage = self._download_webpage(url, video_id)
2501
2502         self.report_extraction(video_id)
2503
2504         # Extract video URL
2505         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2506             webpage, u'video URL'))
2507
2508         # Extract title
2509         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2510             webpage, u'title')
2511
2512         # Extract video thumbnail
2513         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2514             webpage, u'thumbnail', fatal=False)
2515
2516         info = {
2517             'id': video_id,
2518             'url': video_url,
2519             'uploader': None,
2520             'upload_date': None,
2521             'title': video_title,
2522             'ext': 'flv',
2523             'thumbnail': video_thumbnail,
2524             'description': None,
2525         }
2526
2527         return [info]
2528
2529
2530 class SoundcloudIE(InfoExtractor):
2531     """Information extractor for soundcloud.com
2532        To access the media, the uid of the song and a stream token
2533        must be extracted from the page source and the script must make
2534        a request to media.soundcloud.com/crossdomain.xml. Then
2535        the media can be grabbed by requesting from an url composed
2536        of the stream token and uid
2537      """
2538
2539     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2540     IE_NAME = u'soundcloud'
2541
2542     def report_resolve(self, video_id):
2543         """Report information extraction."""
2544         self.to_screen(u'%s: Resolving id' % video_id)
2545
2546     def _real_extract(self, url):
2547         mobj = re.match(self._VALID_URL, url)
2548         if mobj is None:
2549             raise ExtractorError(u'Invalid URL: %s' % url)
2550
2551         # extract uploader (which is in the url)
2552         uploader = mobj.group(1)
2553         # extract simple title (uploader + slug of song title)
2554         slug_title =  mobj.group(2)
2555         simple_title = uploader + u'-' + slug_title
2556         full_title = '%s/%s' % (uploader, slug_title)
2557
2558         self.report_resolve(full_title)
2559
2560         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2561         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2562         info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2563
2564         info = json.loads(info_json)
2565         video_id = info['id']
2566         self.report_extraction(full_title)
2567
2568         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2569         stream_json = self._download_webpage(streams_url, full_title,
2570                                              u'Downloading stream definitions',
2571                                              u'unable to download stream definitions')
2572
2573         streams = json.loads(stream_json)
2574         mediaURL = streams['http_mp3_128_url']
2575         upload_date = unified_strdate(info['created_at'])
2576
2577         return [{
2578             'id':       info['id'],
2579             'url':      mediaURL,
2580             'uploader': info['user']['username'],
2581             'upload_date': upload_date,
2582             'title':    info['title'],
2583             'ext':      u'mp3',
2584             'description': info['description'],
2585         }]
2586
2587 class SoundcloudSetIE(InfoExtractor):
2588     """Information extractor for soundcloud.com sets
2589        To access the media, the uid of the song and a stream token
2590        must be extracted from the page source and the script must make
2591        a request to media.soundcloud.com/crossdomain.xml. Then
2592        the media can be grabbed by requesting from an url composed
2593        of the stream token and uid
2594      """
2595
2596     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2597     IE_NAME = u'soundcloud:set'
2598
2599     def report_resolve(self, video_id):
2600         """Report information extraction."""
2601         self.to_screen(u'%s: Resolving id' % video_id)
2602
2603     def _real_extract(self, url):
2604         mobj = re.match(self._VALID_URL, url)
2605         if mobj is None:
2606             raise ExtractorError(u'Invalid URL: %s' % url)
2607
2608         # extract uploader (which is in the url)
2609         uploader = mobj.group(1)
2610         # extract simple title (uploader + slug of song title)
2611         slug_title =  mobj.group(2)
2612         simple_title = uploader + u'-' + slug_title
2613         full_title = '%s/sets/%s' % (uploader, slug_title)
2614
2615         self.report_resolve(full_title)
2616
2617         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2618         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2619         info_json = self._download_webpage(resolv_url, full_title)
2620
2621         videos = []
2622         info = json.loads(info_json)
2623         if 'errors' in info:
2624             for err in info['errors']:
2625                 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2626             return
2627
2628         self.report_extraction(full_title)
2629         for track in info['tracks']:
2630             video_id = track['id']
2631
2632             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2633             stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2634
2635             self.report_extraction(video_id)
2636             streams = json.loads(stream_json)
2637             mediaURL = streams['http_mp3_128_url']
2638
2639             videos.append({
2640                 'id':       video_id,
2641                 'url':      mediaURL,
2642                 'uploader': track['user']['username'],
2643                 'upload_date':  unified_strdate(track['created_at']),
2644                 'title':    track['title'],
2645                 'ext':      u'mp3',
2646                 'description': track['description'],
2647             })
2648         return videos
2649
2650
2651 class InfoQIE(InfoExtractor):
2652     """Information extractor for infoq.com"""
2653     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2654
2655     def _real_extract(self, url):
2656         mobj = re.match(self._VALID_URL, url)
2657         if mobj is None:
2658             raise ExtractorError(u'Invalid URL: %s' % url)
2659
2660         webpage = self._download_webpage(url, video_id=url)
2661         self.report_extraction(url)
2662
2663         # Extract video URL
2664         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2665         if mobj is None:
2666             raise ExtractorError(u'Unable to extract video url')
2667         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2668         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2669
2670         # Extract title
2671         video_title = self._search_regex(r'contentTitle = "(.*?)";',
2672             webpage, u'title')
2673
2674         # Extract description
2675         video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2676             webpage, u'description', fatal=False)
2677
2678         video_filename = video_url.split('/')[-1]
2679         video_id, extension = video_filename.split('.')
2680
2681         info = {
2682             'id': video_id,
2683             'url': video_url,
2684             'uploader': None,
2685             'upload_date': None,
2686             'title': video_title,
2687             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2688             'thumbnail': None,
2689             'description': video_description,
2690         }
2691
2692         return [info]
2693
2694 class MixcloudIE(InfoExtractor):
2695     """Information extractor for www.mixcloud.com"""
2696
2697     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2698     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2699     IE_NAME = u'mixcloud'
2700
2701     def report_download_json(self, file_id):
2702         """Report JSON download."""
2703         self.to_screen(u'Downloading json')
2704
2705     def get_urls(self, jsonData, fmt, bitrate='best'):
2706         """Get urls from 'audio_formats' section in json"""
2707         file_url = None
2708         try:
2709             bitrate_list = jsonData[fmt]
2710             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2711                 bitrate = max(bitrate_list) # select highest
2712
2713             url_list = jsonData[fmt][bitrate]
2714         except TypeError: # we have no bitrate info.
2715             url_list = jsonData[fmt]
2716         return url_list
2717
2718     def check_urls(self, url_list):
2719         """Returns 1st active url from list"""
2720         for url in url_list:
2721             try:
2722                 compat_urllib_request.urlopen(url)
2723                 return url
2724             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2725                 url = None
2726
2727         return None
2728
2729     def _print_formats(self, formats):
2730         print('Available formats:')
2731         for fmt in formats.keys():
2732             for b in formats[fmt]:
2733                 try:
2734                     ext = formats[fmt][b][0]
2735                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2736                 except TypeError: # we have no bitrate info
2737                     ext = formats[fmt][0]
2738                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2739                     break
2740
2741     def _real_extract(self, url):
2742         mobj = re.match(self._VALID_URL, url)
2743         if mobj is None:
2744             raise ExtractorError(u'Invalid URL: %s' % url)
2745         # extract uploader & filename from url
2746         uploader = mobj.group(1).decode('utf-8')
2747         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2748
2749         # construct API request
2750         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2751         # retrieve .json file with links to files
2752         request = compat_urllib_request.Request(file_url)
2753         try:
2754             self.report_download_json(file_url)
2755             jsonData = compat_urllib_request.urlopen(request).read()
2756         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2757             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2758
2759         # parse JSON
2760         json_data = json.loads(jsonData)
2761         player_url = json_data['player_swf_url']
2762         formats = dict(json_data['audio_formats'])
2763
2764         req_format = self._downloader.params.get('format', None)
2765         bitrate = None
2766
2767         if self._downloader.params.get('listformats', None):
2768             self._print_formats(formats)
2769             return
2770
2771         if req_format is None or req_format == 'best':
2772             for format_param in formats.keys():
2773                 url_list = self.get_urls(formats, format_param)
2774                 # check urls
2775                 file_url = self.check_urls(url_list)
2776                 if file_url is not None:
2777                     break # got it!
2778         else:
2779             if req_format not in formats:
2780                 raise ExtractorError(u'Format is not available')
2781
2782             url_list = self.get_urls(formats, req_format)
2783             file_url = self.check_urls(url_list)
2784             format_param = req_format
2785
2786         return [{
2787             'id': file_id.decode('utf-8'),
2788             'url': file_url.decode('utf-8'),
2789             'uploader': uploader.decode('utf-8'),
2790             'upload_date': None,
2791             'title': json_data['name'],
2792             'ext': file_url.split('.')[-1].decode('utf-8'),
2793             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2794             'thumbnail': json_data['thumbnail_url'],
2795             'description': json_data['description'],
2796             'player_url': player_url.decode('utf-8'),
2797         }]
2798
2799 class StanfordOpenClassroomIE(InfoExtractor):
2800     """Information extractor for Stanford's Open ClassRoom"""
2801
2802     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2803     IE_NAME = u'stanfordoc'
2804
2805     def _real_extract(self, url):
2806         mobj = re.match(self._VALID_URL, url)
2807         if mobj is None:
2808             raise ExtractorError(u'Invalid URL: %s' % url)
2809
2810         if mobj.group('course') and mobj.group('video'): # A specific video
2811             course = mobj.group('course')
2812             video = mobj.group('video')
2813             info = {
2814                 'id': course + '_' + video,
2815                 'uploader': None,
2816                 'upload_date': None,
2817             }
2818
2819             self.report_extraction(info['id'])
2820             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2821             xmlUrl = baseUrl + video + '.xml'
2822             try:
2823                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2824             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2825                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2826             mdoc = xml.etree.ElementTree.fromstring(metaXml)
2827             try:
2828                 info['title'] = mdoc.findall('./title')[0].text
2829                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2830             except IndexError:
2831                 raise ExtractorError(u'Invalid metadata XML file')
2832             info['ext'] = info['url'].rpartition('.')[2]
2833             return [info]
2834         elif mobj.group('course'): # A course page
2835             course = mobj.group('course')
2836             info = {
2837                 'id': course,
2838                 'type': 'playlist',
2839                 'uploader': None,
2840                 'upload_date': None,
2841             }
2842
2843             coursepage = self._download_webpage(url, info['id'],
2844                                         note='Downloading course info page',
2845                                         errnote='Unable to download course info page')
2846
2847             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2848
2849             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2850                 coursepage, u'description', fatal=False)
2851
2852             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2853             info['list'] = [
2854                 {
2855                     'type': 'reference',
2856                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2857                 }
2858                     for vpage in links]
2859             results = []
2860             for entry in info['list']:
2861                 assert entry['type'] == 'reference'
2862                 results += self.extract(entry['url'])
2863             return results
2864         else: # Root page
2865             info = {
2866                 'id': 'Stanford OpenClassroom',
2867                 'type': 'playlist',
2868                 'uploader': None,
2869                 'upload_date': None,
2870             }
2871
2872             self.report_download_webpage(info['id'])
2873             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2874             try:
2875                 rootpage = compat_urllib_request.urlopen(rootURL).read()
2876             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2877                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2878
2879             info['title'] = info['id']
2880
2881             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2882             info['list'] = [
2883                 {
2884                     'type': 'reference',
2885                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2886                 }
2887                     for cpage in links]
2888
2889             results = []
2890             for entry in info['list']:
2891                 assert entry['type'] == 'reference'
2892                 results += self.extract(entry['url'])
2893             return results
2894
2895 class MTVIE(InfoExtractor):
2896     """Information extractor for MTV.com"""
2897
2898     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2899     IE_NAME = u'mtv'
2900
2901     def _real_extract(self, url):
2902         mobj = re.match(self._VALID_URL, url)
2903         if mobj is None:
2904             raise ExtractorError(u'Invalid URL: %s' % url)
2905         if not mobj.group('proto'):
2906             url = 'http://' + url
2907         video_id = mobj.group('videoid')
2908
2909         webpage = self._download_webpage(url, video_id)
2910
2911         song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2912             webpage, u'song name', fatal=False)
2913
2914         video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2915             webpage, u'title')
2916
2917         mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2918             webpage, u'mtvn_uri', fatal=False)
2919
2920         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2921             webpage, u'content id', fatal=False)
2922
2923         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2924         self.report_extraction(video_id)
2925         request = compat_urllib_request.Request(videogen_url)
2926         try:
2927             metadataXml = compat_urllib_request.urlopen(request).read()
2928         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2929             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2930
2931         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2932         renditions = mdoc.findall('.//rendition')
2933
2934         # For now, always pick the highest quality.
2935         rendition = renditions[-1]
2936
2937         try:
2938             _,_,ext = rendition.attrib['type'].partition('/')
2939             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2940             video_url = rendition.find('./src').text
2941         except KeyError:
2942             raise ExtractorError('Invalid rendition field.')
2943
2944         info = {
2945             'id': video_id,
2946             'url': video_url,
2947             'uploader': performer,
2948             'upload_date': None,
2949             'title': video_title,
2950             'ext': ext,
2951             'format': format,
2952         }
2953
2954         return [info]
2955
2956
2957 class YoukuIE(InfoExtractor):
2958     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2959
2960     def _gen_sid(self):
2961         nowTime = int(time.time() * 1000)
2962         random1 = random.randint(1000,1998)
2963         random2 = random.randint(1000,9999)
2964
2965         return "%d%d%d" %(nowTime,random1,random2)
2966
2967     def _get_file_ID_mix_string(self, seed):
2968         mixed = []
2969         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2970         seed = float(seed)
2971         for i in range(len(source)):
2972             seed  =  (seed * 211 + 30031 ) % 65536
2973             index  =  math.floor(seed / 65536 * len(source) )
2974             mixed.append(source[int(index)])
2975             source.remove(source[int(index)])
2976         #return ''.join(mixed)
2977         return mixed
2978
2979     def _get_file_id(self, fileId, seed):
2980         mixed = self._get_file_ID_mix_string(seed)
2981         ids = fileId.split('*')
2982         realId = []
2983         for ch in ids:
2984             if ch:
2985                 realId.append(mixed[int(ch)])
2986         return ''.join(realId)
2987
2988     def _real_extract(self, url):
2989         mobj = re.match(self._VALID_URL, url)
2990         if mobj is None:
2991             raise ExtractorError(u'Invalid URL: %s' % url)
2992         video_id = mobj.group('ID')
2993
2994         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2995
2996         jsondata = self._download_webpage(info_url, video_id)
2997
2998         self.report_extraction(video_id)
2999         try:
3000             config = json.loads(jsondata)
3001
3002             video_title =  config['data'][0]['title']
3003             seed = config['data'][0]['seed']
3004
3005             format = self._downloader.params.get('format', None)
3006             supported_format = list(config['data'][0]['streamfileids'].keys())
3007
3008             if format is None or format == 'best':
3009                 if 'hd2' in supported_format:
3010                     format = 'hd2'
3011                 else:
3012                     format = 'flv'
3013                 ext = u'flv'
3014             elif format == 'worst':
3015                 format = 'mp4'
3016                 ext = u'mp4'
3017             else:
3018                 format = 'flv'
3019                 ext = u'flv'
3020
3021
3022             fileid = config['data'][0]['streamfileids'][format]
3023             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3024         except (UnicodeDecodeError, ValueError, KeyError):
3025             raise ExtractorError(u'Unable to extract info section')
3026
3027         files_info=[]
3028         sid = self._gen_sid()
3029         fileid = self._get_file_id(fileid, seed)
3030
3031         #column 8,9 of fileid represent the segment number
3032         #fileid[7:9] should be changed
3033         for index, key in enumerate(keys):
3034
3035             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3036             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3037
3038             info = {
3039                 'id': '%s_part%02d' % (video_id, index),
3040                 'url': download_url,
3041                 'uploader': None,
3042                 'upload_date': None,
3043                 'title': video_title,
3044                 'ext': ext,
3045             }
3046             files_info.append(info)
3047
3048         return files_info
3049
3050
3051 class XNXXIE(InfoExtractor):
3052     """Information extractor for xnxx.com"""
3053
3054     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3055     IE_NAME = u'xnxx'
3056     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3057     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3058     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3059
3060     def _real_extract(self, url):
3061         mobj = re.match(self._VALID_URL, url)
3062         if mobj is None:
3063             raise ExtractorError(u'Invalid URL: %s' % url)
3064         video_id = mobj.group(1)
3065
3066         # Get webpage content
3067         webpage = self._download_webpage(url, video_id)
3068
3069         video_url = self._search_regex(self.VIDEO_URL_RE,
3070             webpage, u'video URL')
3071         video_url = compat_urllib_parse.unquote(video_url)
3072
3073         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
3074             webpage, u'title')
3075
3076         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3077             webpage, u'thumbnail', fatal=False)
3078
3079         return [{
3080             'id': video_id,
3081             'url': video_url,
3082             'uploader': None,
3083             'upload_date': None,
3084             'title': video_title,
3085             'ext': 'flv',
3086             'thumbnail': video_thumbnail,
3087             'description': None,
3088         }]
3089
3090
3091 class GooglePlusIE(InfoExtractor):
3092     """Information extractor for plus.google.com."""
3093
3094     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3095     IE_NAME = u'plus.google'
3096
3097     def _real_extract(self, url):
3098         # Extract id from URL
3099         mobj = re.match(self._VALID_URL, url)
3100         if mobj is None:
3101             raise ExtractorError(u'Invalid URL: %s' % url)
3102
3103         post_url = mobj.group(0)
3104         video_id = mobj.group(1)
3105
3106         video_extension = 'flv'
3107
3108         # Step 1, Retrieve post webpage to extract further information
3109         webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3110
3111         self.report_extraction(video_id)
3112
3113         # Extract update date
3114         upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3115             webpage, u'upload date', fatal=False)
3116         if upload_date:
3117             # Convert timestring to a format suitable for filename
3118             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3119             upload_date = upload_date.strftime('%Y%m%d')
3120
3121         # Extract uploader
3122         uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3123             webpage, u'uploader', fatal=False)
3124
3125         # Extract title
3126         # Get the first line for title
3127         video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3128             webpage, 'title', default=u'NA')
3129
3130         # Step 2, Stimulate clicking the image box to launch video
3131         video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3132             webpage, u'video page URL')
3133         webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3134
3135         # Extract video links on video page
3136         """Extract video links of all sizes"""
3137         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3138         mobj = re.findall(pattern, webpage)
3139         if len(mobj) == 0:
3140             raise ExtractorError(u'Unable to extract video links')
3141
3142         # Sort in resolution
3143         links = sorted(mobj)
3144
3145         # Choose the lowest of the sort, i.e. highest resolution
3146         video_url = links[-1]
3147         # Only get the url. The resolution part in the tuple has no use anymore
3148         video_url = video_url[-1]
3149         # Treat escaped \u0026 style hex
3150         try:
3151             video_url = video_url.decode("unicode_escape")
3152         except AttributeError: # Python 3
3153             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3154
3155
3156         return [{
3157             'id':       video_id,
3158             'url':      video_url,
3159             'uploader': uploader,
3160             'upload_date':  upload_date,
3161             'title':    video_title,
3162             'ext':      video_extension,
3163         }]
3164
3165 class NBAIE(InfoExtractor):
3166     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3167     IE_NAME = u'nba'
3168
3169     def _real_extract(self, url):
3170         mobj = re.match(self._VALID_URL, url)
3171         if mobj is None:
3172             raise ExtractorError(u'Invalid URL: %s' % url)
3173
3174         video_id = mobj.group(1)
3175
3176         webpage = self._download_webpage(url, video_id)
3177
3178         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3179
3180         shortened_video_id = video_id.rpartition('/')[2]
3181         title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3182             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3183
3184         # It isn't there in the HTML it returns to us
3185         # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3186
3187         description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3188
3189         info = {
3190             'id': shortened_video_id,
3191             'url': video_url,
3192             'ext': 'mp4',
3193             'title': title,
3194             # 'uploader_date': uploader_date,
3195             'description': description,
3196         }
3197         return [info]
3198
3199 class JustinTVIE(InfoExtractor):
3200     """Information extractor for justin.tv and twitch.tv"""
3201     # TODO: One broadcast may be split into multiple videos. The key
3202     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3203     # starts at 1 and increases. Can we treat all parts as one video?
3204
3205     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3206         (?:
3207             (?P<channelid>[^/]+)|
3208             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3209             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3210         )
3211         /?(?:\#.*)?$
3212         """
3213     _JUSTIN_PAGE_LIMIT = 100
3214     IE_NAME = u'justin.tv'
3215
3216     def report_download_page(self, channel, offset):
3217         """Report attempt to download a single page of videos."""
3218         self.to_screen(u'%s: Downloading video information from %d to %d' %
3219                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3220
3221     # Return count of items, list of *valid* items
3222     def _parse_page(self, url, video_id):
3223         webpage = self._download_webpage(url, video_id,
3224                                          u'Downloading video info JSON',
3225                                          u'unable to download video info JSON')
3226
3227         response = json.loads(webpage)
3228         if type(response) != list:
3229             error_text = response.get('error', 'unknown error')
3230             raise ExtractorError(u'Justin.tv API: %s' % error_text)
3231         info = []
3232         for clip in response:
3233             video_url = clip['video_file_url']
3234             if video_url:
3235                 video_extension = os.path.splitext(video_url)[1][1:]
3236                 video_date = re.sub('-', '', clip['start_time'][:10])
3237                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3238                 video_id = clip['id']
3239                 video_title = clip.get('title', video_id)
3240                 info.append({
3241                     'id': video_id,
3242                     'url': video_url,
3243                     'title': video_title,
3244                     'uploader': clip.get('channel_name', video_uploader_id),
3245                     'uploader_id': video_uploader_id,
3246                     'upload_date': video_date,
3247                     'ext': video_extension,
3248                 })
3249         return (len(response), info)
3250
3251     def _real_extract(self, url):
3252         mobj = re.match(self._VALID_URL, url)
3253         if mobj is None:
3254             raise ExtractorError(u'invalid URL: %s' % url)
3255
3256         api_base = 'http://api.justin.tv'
3257         paged = False
3258         if mobj.group('channelid'):
3259             paged = True
3260             video_id = mobj.group('channelid')
3261             api = api_base + '/channel/archives/%s.json' % video_id
3262         elif mobj.group('chapterid'):
3263             chapter_id = mobj.group('chapterid')
3264
3265             webpage = self._download_webpage(url, chapter_id)
3266             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3267             if not m:
3268                 raise ExtractorError(u'Cannot find archive of a chapter')
3269             archive_id = m.group(1)
3270
3271             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3272             chapter_info_xml = self._download_webpage(api, chapter_id,
3273                                              note=u'Downloading chapter information',
3274                                              errnote=u'Chapter information download failed')
3275             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3276             for a in doc.findall('.//archive'):
3277                 if archive_id == a.find('./id').text:
3278                     break
3279             else:
3280                 raise ExtractorError(u'Could not find chapter in chapter information')
3281
3282             video_url = a.find('./video_file_url').text
3283             video_ext = video_url.rpartition('.')[2] or u'flv'
3284
3285             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3286             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3287                                    note='Downloading chapter metadata',
3288                                    errnote='Download of chapter metadata failed')
3289             chapter_info = json.loads(chapter_info_json)
3290
3291             bracket_start = int(doc.find('.//bracket_start').text)
3292             bracket_end = int(doc.find('.//bracket_end').text)
3293
3294             # TODO determine start (and probably fix up file)
3295             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3296             #video_url += u'?start=' + TODO:start_timestamp
3297             # bracket_start is 13290, but we want 51670615
3298             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3299                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3300
3301             info = {
3302                 'id': u'c' + chapter_id,
3303                 'url': video_url,
3304                 'ext': video_ext,
3305                 'title': chapter_info['title'],
3306                 'thumbnail': chapter_info['preview'],
3307                 'description': chapter_info['description'],
3308                 'uploader': chapter_info['channel']['display_name'],
3309                 'uploader_id': chapter_info['channel']['name'],
3310             }
3311             return [info]
3312         else:
3313             video_id = mobj.group('videoid')
3314             api = api_base + '/broadcast/by_archive/%s.json' % video_id
3315
3316         self.report_extraction(video_id)
3317
3318         info = []
3319         offset = 0
3320         limit = self._JUSTIN_PAGE_LIMIT
3321         while True:
3322             if paged:
3323                 self.report_download_page(video_id, offset)
3324             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3325             page_count, page_info = self._parse_page(page_url, video_id)
3326             info.extend(page_info)
3327             if not paged or page_count != limit:
3328                 break
3329             offset += limit
3330         return info
3331
3332 class FunnyOrDieIE(InfoExtractor):
3333     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3334
3335     def _real_extract(self, url):
3336         mobj = re.match(self._VALID_URL, url)
3337         if mobj is None:
3338             raise ExtractorError(u'invalid URL: %s' % url)
3339
3340         video_id = mobj.group('id')
3341         webpage = self._download_webpage(url, video_id)
3342
3343         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3344             webpage, u'video URL', flags=re.DOTALL)
3345
3346         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3347             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3348
3349         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3350             webpage, u'description', fatal=False, flags=re.DOTALL)
3351
3352         info = {
3353             'id': video_id,
3354             'url': video_url,
3355             'ext': 'mp4',
3356             'title': title,
3357             'description': video_description,
3358         }
3359         return [info]
3360
3361 class SteamIE(InfoExtractor):
3362     _VALID_URL = r"""http://store\.steampowered\.com/
3363                 (agecheck/)?
3364                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3365                 (?P<gameID>\d+)/?
3366                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3367                 """
3368     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
3369     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
3370
3371     @classmethod
3372     def suitable(cls, url):
3373         """Receives a URL and returns True if suitable for this IE."""
3374         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3375
3376     def _real_extract(self, url):
3377         m = re.match(self._VALID_URL, url, re.VERBOSE)
3378         gameID = m.group('gameID')
3379
3380         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3381         webpage = self._download_webpage(videourl, gameID)
3382
3383         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3384             videourl = self._AGECHECK_TEMPLATE % gameID
3385             self.report_age_confirmation()
3386             webpage = self._download_webpage(videourl, gameID)
3387
3388         self.report_extraction(gameID)
3389         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3390                                              webpage, 'game title')
3391
3392         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3393         mweb = re.finditer(urlRE, webpage)
3394         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3395         titles = re.finditer(namesRE, webpage)
3396         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3397         thumbs = re.finditer(thumbsRE, webpage)
3398         videos = []
3399         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3400             video_id = vid.group('videoID')
3401             title = vtitle.group('videoName')
3402             video_url = vid.group('videoURL')
3403             video_thumb = thumb.group('thumbnail')
3404             if not video_url:
3405                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3406             info = {
3407                 'id':video_id,
3408                 'url':video_url,
3409                 'ext': 'flv',
3410                 'title': unescapeHTML(title),
3411                 'thumbnail': video_thumb
3412                   }
3413             videos.append(info)
3414         return [self.playlist_result(videos, gameID, game_title)]
3415
3416 class UstreamIE(InfoExtractor):
3417     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3418     IE_NAME = u'ustream'
3419
3420     def _real_extract(self, url):
3421         m = re.match(self._VALID_URL, url)
3422         video_id = m.group('videoID')
3423
3424         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3425         webpage = self._download_webpage(url, video_id)
3426
3427         self.report_extraction(video_id)
3428
3429         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3430             webpage, u'title')
3431
3432         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3433             webpage, u'uploader', fatal=False, flags=re.DOTALL)
3434
3435         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3436             webpage, u'thumbnail', fatal=False)
3437
3438         info = {
3439                 'id': video_id,
3440                 'url': video_url,
3441                 'ext': 'flv',
3442                 'title': video_title,
3443                 'uploader': uploader,
3444                 'thumbnail': thumbnail,
3445                }
3446         return info
3447
3448 class WorldStarHipHopIE(InfoExtractor):
3449     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3450     IE_NAME = u'WorldStarHipHop'
3451
3452     def _real_extract(self, url):
3453         m = re.match(self._VALID_URL, url)
3454         video_id = m.group('id')
3455
3456         webpage_src = self._download_webpage(url, video_id)
3457
3458         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3459             webpage_src, u'video URL')
3460
3461         if 'mp4' in video_url:
3462             ext = 'mp4'
3463         else:
3464             ext = 'flv'
3465
3466         video_title = self._html_search_regex(r"<title>(.*)</title>",
3467             webpage_src, u'title')
3468
3469         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3470         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3471             webpage_src, u'thumbnail', fatal=False)
3472
3473         if not thumbnail:
3474             _title = r"""candytitles.*>(.*)</span>"""
3475             mobj = re.search(_title, webpage_src)
3476             if mobj is not None:
3477                 video_title = mobj.group(1)
3478
3479         results = [{
3480                     'id': video_id,
3481                     'url' : video_url,
3482                     'title' : video_title,
3483                     'thumbnail' : thumbnail,
3484                     'ext' : ext,
3485                     }]
3486         return results
3487
3488 class RBMARadioIE(InfoExtractor):
3489     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3490
3491     def _real_extract(self, url):
3492         m = re.match(self._VALID_URL, url)
3493         video_id = m.group('videoID')
3494
3495         webpage = self._download_webpage(url, video_id)
3496
3497         json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
3498             webpage, u'json data')
3499
3500         try:
3501             data = json.loads(json_data)
3502         except ValueError as e:
3503             raise ExtractorError(u'Invalid JSON: ' + str(e))
3504
3505         video_url = data['akamai_url'] + '&cbr=256'
3506         url_parts = compat_urllib_parse_urlparse(video_url)
3507         video_ext = url_parts.path.rpartition('.')[2]
3508         info = {
3509                 'id': video_id,
3510                 'url': video_url,
3511                 'ext': video_ext,
3512                 'title': data['title'],
3513                 'description': data.get('teaser_text'),
3514                 'location': data.get('country_of_origin'),
3515                 'uploader': data.get('host', {}).get('name'),
3516                 'uploader_id': data.get('host', {}).get('slug'),
3517                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3518                 'duration': data.get('duration'),
3519         }
3520         return [info]
3521
3522
3523 class YouPornIE(InfoExtractor):
3524     """Information extractor for youporn.com."""
3525     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3526
3527     def _print_formats(self, formats):
3528         """Print all available formats"""
3529         print(u'Available formats:')
3530         print(u'ext\t\tformat')
3531         print(u'---------------------------------')
3532         for format in formats:
3533             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3534
3535     def _specific(self, req_format, formats):
3536         for x in formats:
3537             if(x["format"]==req_format):
3538                 return x
3539         return None
3540
3541     def _real_extract(self, url):
3542         mobj = re.match(self._VALID_URL, url)
3543         if mobj is None:
3544             raise ExtractorError(u'Invalid URL: %s' % url)
3545         video_id = mobj.group('videoid')
3546
3547         req = compat_urllib_request.Request(url)
3548         req.add_header('Cookie', 'age_verified=1')
3549         webpage = self._download_webpage(req, video_id)
3550
3551         # Get JSON parameters
3552         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3553         try:
3554             params = json.loads(json_params)
3555         except:
3556             raise ExtractorError(u'Invalid JSON')
3557
3558         self.report_extraction(video_id)
3559         try:
3560             video_title = params['title']
3561             upload_date = unified_strdate(params['release_date_f'])
3562             video_description = params['description']
3563             video_uploader = params['submitted_by']
3564             thumbnail = params['thumbnails'][0]['image']
3565         except KeyError:
3566             raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3567
3568         # Get all of the formats available
3569         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3570         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3571             webpage, u'download list').strip()
3572
3573         # Get all of the links from the page
3574         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3575         links = re.findall(LINK_RE, download_list_html)
3576         if(len(links) == 0):
3577             raise ExtractorError(u'ERROR: no known formats available for video')
3578
3579         self.to_screen(u'Links found: %d' % len(links))
3580
3581         formats = []
3582         for link in links:
3583
3584             # A link looks like this:
3585             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3586             # A path looks like this:
3587             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3588             video_url = unescapeHTML( link )
3589             path = compat_urllib_parse_urlparse( video_url ).path
3590             extension = os.path.splitext( path )[1][1:]
3591             format = path.split('/')[4].split('_')[:2]
3592             size = format[0]
3593             bitrate = format[1]
3594             format = "-".join( format )
3595             # title = u'%s-%s-%s' % (video_title, size, bitrate)
3596
3597             formats.append({
3598                 'id': video_id,
3599                 'url': video_url,
3600                 'uploader': video_uploader,
3601                 'upload_date': upload_date,
3602                 'title': video_title,
3603                 'ext': extension,
3604                 'format': format,
3605                 'thumbnail': thumbnail,
3606                 'description': video_description
3607             })
3608
3609         if self._downloader.params.get('listformats', None):
3610             self._print_formats(formats)
3611             return
3612
3613         req_format = self._downloader.params.get('format', None)
3614         self.to_screen(u'Format: %s' % req_format)
3615
3616         if req_format is None or req_format == 'best':
3617             return [formats[0]]
3618         elif req_format == 'worst':
3619             return [formats[-1]]
3620         elif req_format in ('-1', 'all'):
3621             return formats
3622         else:
3623             format = self._specific( req_format, formats )
3624             if result is None:
3625                 raise ExtractorError(u'Requested format not available')
3626             return [format]
3627
3628
3629
3630 class PornotubeIE(InfoExtractor):
3631     """Information extractor for pornotube.com."""
3632     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3633
3634     def _real_extract(self, url):
3635         mobj = re.match(self._VALID_URL, url)
3636         if mobj is None:
3637             raise ExtractorError(u'Invalid URL: %s' % url)
3638
3639         video_id = mobj.group('videoid')
3640         video_title = mobj.group('title')
3641
3642         # Get webpage content
3643         webpage = self._download_webpage(url, video_id)
3644
3645         # Get the video URL
3646         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3647         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3648         video_url = compat_urllib_parse.unquote(video_url)
3649
3650         #Get the uploaded date
3651         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3652         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
3653         if upload_date: upload_date = unified_strdate(upload_date)
3654
3655         info = {'id': video_id,
3656                 'url': video_url,
3657                 'uploader': None,
3658                 'upload_date': upload_date,
3659                 'title': video_title,
3660                 'ext': 'flv',
3661                 'format': 'flv'}
3662
3663         return [info]
3664
3665 class YouJizzIE(InfoExtractor):
3666     """Information extractor for youjizz.com."""
3667     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3668
3669     def _real_extract(self, url):
3670         mobj = re.match(self._VALID_URL, url)
3671         if mobj is None:
3672             raise ExtractorError(u'Invalid URL: %s' % url)
3673
3674         video_id = mobj.group('videoid')
3675
3676         # Get webpage content
3677         webpage = self._download_webpage(url, video_id)
3678
3679         # Get the video title
3680         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3681             webpage, u'title').strip()
3682
3683         # Get the embed page
3684         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3685         if result is None:
3686             raise ExtractorError(u'ERROR: unable to extract embed page')
3687
3688         embed_page_url = result.group(0).strip()
3689         video_id = result.group('videoid')
3690
3691         webpage = self._download_webpage(embed_page_url, video_id)
3692
3693         # Get the video URL
3694         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3695             webpage, u'video URL')
3696
3697         info = {'id': video_id,
3698                 'url': video_url,
3699                 'title': video_title,
3700                 'ext': 'flv',
3701                 'format': 'flv',
3702                 'player_url': embed_page_url}
3703
3704         return [info]
3705
3706 class EightTracksIE(InfoExtractor):
3707     IE_NAME = '8tracks'
3708     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3709
3710     def _real_extract(self, url):
3711         mobj = re.match(self._VALID_URL, url)
3712         if mobj is None:
3713             raise ExtractorError(u'Invalid URL: %s' % url)
3714         playlist_id = mobj.group('id')
3715
3716         webpage = self._download_webpage(url, playlist_id)
3717
3718         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3719         data = json.loads(json_like)
3720
3721         session = str(random.randint(0, 1000000000))
3722         mix_id = data['id']
3723         track_count = data['tracks_count']
3724         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3725         next_url = first_url
3726         res = []
3727         for i in itertools.count():
3728             api_json = self._download_webpage(next_url, playlist_id,
3729                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3730                 errnote=u'Failed to download song information')
3731             api_data = json.loads(api_json)
3732             track_data = api_data[u'set']['track']
3733             info = {
3734                 'id': track_data['id'],
3735                 'url': track_data['track_file_stream_url'],
3736                 'title': track_data['performer'] + u' - ' + track_data['name'],
3737                 'raw_title': track_data['name'],
3738                 'uploader_id': data['user']['login'],
3739                 'ext': 'm4a',
3740             }
3741             res.append(info)
3742             if api_data['set']['at_last_track']:
3743                 break
3744             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3745         return res
3746
3747 class KeekIE(InfoExtractor):
3748     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3749     IE_NAME = u'keek'
3750
3751     def _real_extract(self, url):
3752         m = re.match(self._VALID_URL, url)
3753         video_id = m.group('videoID')
3754
3755         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3756         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3757         webpage = self._download_webpage(url, video_id)
3758
3759         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3760             webpage, u'title')
3761
3762         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3763             webpage, u'uploader', fatal=False)
3764
3765         info = {
3766                 'id': video_id,
3767                 'url': video_url,
3768                 'ext': 'mp4',
3769                 'title': video_title,
3770                 'thumbnail': thumbnail,
3771                 'uploader': uploader
3772         }
3773         return [info]
3774
3775 class TEDIE(InfoExtractor):
3776     _VALID_URL=r'''http://www\.ted\.com/
3777                    (
3778                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3779                         |
3780                         ((?P<type_talk>talks)) # We have a simple talk
3781                    )
3782                    (/lang/(.*?))? # The url may contain the language
3783                    /(?P<name>\w+) # Here goes the name and then ".html"
3784                    '''
3785
3786     @classmethod
3787     def suitable(cls, url):
3788         """Receives a URL and returns True if suitable for this IE."""
3789         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3790
3791     def _real_extract(self, url):
3792         m=re.match(self._VALID_URL, url, re.VERBOSE)
3793         if m.group('type_talk'):
3794             return [self._talk_info(url)]
3795         else :
3796             playlist_id=m.group('playlist_id')
3797             name=m.group('name')
3798             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3799             return [self._playlist_videos_info(url,name,playlist_id)]
3800
3801     def _talk_video_link(self,mediaSlug):
3802         '''Returns the video link for that mediaSlug'''
3803         return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3804
3805     def _playlist_videos_info(self,url,name,playlist_id=0):
3806         '''Returns the videos of the playlist'''
3807         video_RE=r'''
3808                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3809                      ([.\s]*?)data-playlist_item_id="(\d+)"
3810                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3811                      '''
3812         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3813         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3814         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3815         m_names=re.finditer(video_name_RE,webpage)
3816
3817         playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3818         m_playlist = re.search(playlist_RE, webpage)
3819         playlist_title = m_playlist.group('playlist_title')
3820
3821         playlist_entries = []
3822         for m_video, m_name in zip(m_videos,m_names):
3823             video_id=m_video.group('video_id')
3824             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3825             playlist_entries.append(self.url_result(talk_url, 'TED'))
3826         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3827
3828     def _talk_info(self, url, video_id=0):
3829         """Return the video for the talk in the url"""
3830         m=re.match(self._VALID_URL, url,re.VERBOSE)
3831         videoName=m.group('name')
3832         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3833         # If the url includes the language we get the title translated
3834         title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3835         title=re.search(title_RE, webpage).group('title')
3836         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3837                         "id":(?P<videoID>[\d]+).*?
3838                         "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3839         thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3840         thumb_match=re.search(thumb_RE,webpage)
3841         info_match=re.search(info_RE,webpage,re.VERBOSE)
3842         video_id=info_match.group('videoID')
3843         mediaSlug=info_match.group('mediaSlug')
3844         video_url=self._talk_video_link(mediaSlug)
3845         info = {
3846                 'id': video_id,
3847                 'url': video_url,
3848                 'ext': 'mp4',
3849                 'title': title,
3850                 'thumbnail': thumb_match.group('thumbnail')
3851                 }
3852         return info
3853
3854 class MySpassIE(InfoExtractor):
3855     _VALID_URL = r'http://www.myspass.de/.*'
3856
3857     def _real_extract(self, url):
3858         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3859
3860         # video id is the last path element of the URL
3861         # usually there is a trailing slash, so also try the second but last
3862         url_path = compat_urllib_parse_urlparse(url).path
3863         url_parent_path, video_id = os.path.split(url_path)
3864         if not video_id:
3865             _, video_id = os.path.split(url_parent_path)
3866
3867         # get metadata
3868         metadata_url = META_DATA_URL_TEMPLATE % video_id
3869         metadata_text = self._download_webpage(metadata_url, video_id)
3870         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3871
3872         # extract values from metadata
3873         url_flv_el = metadata.find('url_flv')
3874         if url_flv_el is None:
3875             raise ExtractorError(u'Unable to extract download url')
3876         video_url = url_flv_el.text
3877         extension = os.path.splitext(video_url)[1][1:]
3878         title_el = metadata.find('title')
3879         if title_el is None:
3880             raise ExtractorError(u'Unable to extract title')
3881         title = title_el.text
3882         format_id_el = metadata.find('format_id')
3883         if format_id_el is None:
3884             format = ext
3885         else:
3886             format = format_id_el.text
3887         description_el = metadata.find('description')
3888         if description_el is not None:
3889             description = description_el.text
3890         else:
3891             description = None
3892         imagePreview_el = metadata.find('imagePreview')
3893         if imagePreview_el is not None:
3894             thumbnail = imagePreview_el.text
3895         else:
3896             thumbnail = None
3897         info = {
3898             'id': video_id,
3899             'url': video_url,
3900             'title': title,
3901             'ext': extension,
3902             'format': format,
3903             'thumbnail': thumbnail,
3904             'description': description
3905         }
3906         return [info]
3907
3908 class SpiegelIE(InfoExtractor):
3909     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3910
3911     def _real_extract(self, url):
3912         m = re.match(self._VALID_URL, url)
3913         video_id = m.group('videoID')
3914
3915         webpage = self._download_webpage(url, video_id)
3916
3917         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3918             webpage, u'title')
3919
3920         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3921         xml_code = self._download_webpage(xml_url, video_id,
3922                     note=u'Downloading XML', errnote=u'Failed to download XML')
3923
3924         idoc = xml.etree.ElementTree.fromstring(xml_code)
3925         last_type = idoc[-1]
3926         filename = last_type.findall('./filename')[0].text
3927         duration = float(last_type.findall('./duration')[0].text)
3928
3929         video_url = 'http://video2.spiegel.de/flash/' + filename
3930         video_ext = filename.rpartition('.')[2]
3931         info = {
3932             'id': video_id,
3933             'url': video_url,
3934             'ext': video_ext,
3935             'title': video_title,
3936             'duration': duration,
3937         }
3938         return [info]
3939
3940 class LiveLeakIE(InfoExtractor):
3941
3942     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3943     IE_NAME = u'liveleak'
3944
3945     def _real_extract(self, url):
3946         mobj = re.match(self._VALID_URL, url)
3947         if mobj is None:
3948             raise ExtractorError(u'Invalid URL: %s' % url)
3949
3950         video_id = mobj.group('video_id')
3951
3952         webpage = self._download_webpage(url, video_id)
3953
3954         video_url = self._search_regex(r'file: "(.*?)",',
3955             webpage, u'video URL')
3956
3957         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3958             webpage, u'title').replace('LiveLeak.com -', '').strip()
3959
3960         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3961             webpage, u'description', fatal=False)
3962
3963         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3964             webpage, u'uploader', fatal=False)
3965
3966         info = {
3967             'id':  video_id,
3968             'url': video_url,
3969             'ext': 'mp4',
3970             'title': video_title,
3971             'description': video_description,
3972             'uploader': video_uploader
3973         }
3974
3975         return [info]
3976
3977 class ARDIE(InfoExtractor):
3978     _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3979     _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3980     _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3981
3982     def _real_extract(self, url):
3983         # determine video id from url
3984         m = re.match(self._VALID_URL, url)
3985
3986         numid = re.search(r'documentId=([0-9]+)', url)
3987         if numid:
3988             video_id = numid.group(1)
3989         else:
3990             video_id = m.group('video_id')
3991
3992         # determine title and media streams from webpage
3993         html = self._download_webpage(url, video_id)
3994         title = re.search(self._TITLE, html).group('title')
3995         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
3996         if not streams:
3997             assert '"fsk"' in html
3998             raise ExtractorError(u'This video is only available after 8:00 pm')
3999
4000         # choose default media type and highest quality for now
4001         stream = max([s for s in streams if int(s["media_type"]) == 0],
4002                      key=lambda s: int(s["quality"]))
4003
4004         # there's two possibilities: RTMP stream or HTTP download
4005         info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4006         if stream['rtmp_url']:
4007             self.to_screen(u'RTMP download detected')
4008             assert stream['video_url'].startswith('mp4:')
4009             info["url"] = stream["rtmp_url"]
4010             info["play_path"] = stream['video_url']
4011         else:
4012             assert stream["video_url"].endswith('.mp4')
4013             info["url"] = stream["video_url"]
4014         return [info]
4015
4016 class ZDFIE(InfoExtractor):
4017     _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4018     _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
4019     _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
4020     _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
4021     _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
4022
4023     def _real_extract(self, url):
4024         mobj = re.match(self._VALID_URL, url)
4025         if mobj is None:
4026             raise ExtractorError(u'Invalid URL: %s' % url)
4027         video_id = mobj.group('video_id')
4028
4029         html = self._download_webpage(url, video_id)
4030         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4031         if streams is None:
4032             raise ExtractorError(u'No media url found.')
4033
4034         # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
4035         # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
4036         # choose first/default media type and highest quality for now
4037         for s in streams:        #find 300 - dsl1000mbit
4038             if s['quality'] == '300' and s['media_type'] == 'wstreaming':
4039                 stream_=s
4040                 break
4041         for s in streams:        #find veryhigh - dsl2000mbit
4042             if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
4043                 stream_=s
4044                 break
4045         if stream_ is None:
4046             raise ExtractorError(u'No stream found.')
4047
4048         media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
4049
4050         self.report_extraction(video_id)
4051         mobj = re.search(self._TITLE, html)
4052         if mobj is None:
4053             raise ExtractorError(u'Cannot extract title')
4054         title = unescapeHTML(mobj.group('title'))
4055
4056         mobj = re.search(self._MMS_STREAM, media_link)
4057         if mobj is None:
4058             mobj = re.search(self._RTSP_STREAM, media_link)
4059             if mobj is None:
4060                 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
4061         mms_url = mobj.group('video_url')
4062
4063         mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
4064         if mobj is None:
4065             raise ExtractorError(u'Cannot extract extention')
4066         ext = mobj.group('ext')
4067
4068         return [{'id': video_id,
4069                  'url': mms_url,
4070                  'title': title,
4071                  'ext': ext
4072                  }]
4073
4074 class TumblrIE(InfoExtractor):
4075     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4076
4077     def _real_extract(self, url):
4078         m_url = re.match(self._VALID_URL, url)
4079         video_id = m_url.group('id')
4080         blog = m_url.group('blog_name')
4081
4082         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4083         webpage = self._download_webpage(url, video_id)
4084
4085         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4086         video = re.search(re_video, webpage)
4087         if video is None:
4088            raise ExtractorError(u'Unable to extract video')
4089         video_url = video.group('video_url')
4090         ext = video.group('ext')
4091
4092         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
4093             webpage, u'thumbnail', fatal=False)  # We pick the first poster
4094         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
4095
4096         # The only place where you can get a title, it's not complete,
4097         # but searching in other places doesn't work for all videos
4098         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
4099             webpage, u'title', flags=re.DOTALL)
4100
4101         return [{'id': video_id,
4102                  'url': video_url,
4103                  'title': video_title,
4104                  'thumbnail': video_thumbnail,
4105                  'ext': ext
4106                  }]
4107
4108 class BandcampIE(InfoExtractor):
4109     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4110
4111     def _real_extract(self, url):
4112         mobj = re.match(self._VALID_URL, url)
4113         title = mobj.group('title')
4114         webpage = self._download_webpage(url, title)
4115         # We get the link to the free download page
4116         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4117         if m_download is None:
4118             raise ExtractorError(u'No free songs found')
4119
4120         download_link = m_download.group(1)
4121         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4122                        webpage, re.MULTILINE|re.DOTALL).group('id')
4123
4124         download_webpage = self._download_webpage(download_link, id,
4125                                                   'Downloading free downloads page')
4126         # We get the dictionary of the track from some javascrip code
4127         info = re.search(r'items: (.*?),$',
4128                          download_webpage, re.MULTILINE).group(1)
4129         info = json.loads(info)[0]
4130         # We pick mp3-320 for now, until format selection can be easily implemented.
4131         mp3_info = info[u'downloads'][u'mp3-320']
4132         # If we try to use this url it says the link has expired
4133         initial_url = mp3_info[u'url']
4134         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4135         m_url = re.match(re_url, initial_url)
4136         #We build the url we will use to get the final track url
4137         # This url is build in Bandcamp in the script download_bunde_*.js
4138         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4139         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4140         # If we could correctly generate the .rand field the url would be
4141         #in the "download_url" key
4142         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4143
4144         track_info = {'id':id,
4145                       'title' : info[u'title'],
4146                       'ext' :   'mp3',
4147                       'url' :   final_url,
4148                       'thumbnail' : info[u'thumb_url'],
4149                       'uploader' :  info[u'artist']
4150                       }
4151
4152         return [track_info]
4153
4154 class RedTubeIE(InfoExtractor):
4155     """Information Extractor for redtube"""
4156     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4157
4158     def _real_extract(self,url):
4159         mobj = re.match(self._VALID_URL, url)
4160         if mobj is None:
4161             raise ExtractorError(u'Invalid URL: %s' % url)
4162
4163         video_id = mobj.group('id')
4164         video_extension = 'mp4'
4165         webpage = self._download_webpage(url, video_id)
4166
4167         self.report_extraction(video_id)
4168
4169         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
4170             webpage, u'video URL')
4171
4172         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
4173             webpage, u'title')
4174
4175         return [{
4176             'id':       video_id,
4177             'url':      video_url,
4178             'ext':      video_extension,
4179             'title':    video_title,
4180         }]
4181
4182 class InaIE(InfoExtractor):
4183     """Information Extractor for Ina.fr"""
4184     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4185
4186     def _real_extract(self,url):
4187         mobj = re.match(self._VALID_URL, url)
4188
4189         video_id = mobj.group('id')
4190         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4191         video_extension = 'mp4'
4192         webpage = self._download_webpage(mrss_url, video_id)
4193
4194         self.report_extraction(video_id)
4195
4196         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
4197             webpage, u'video URL')
4198
4199         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
4200             webpage, u'title')
4201
4202         return [{
4203             'id':       video_id,
4204             'url':      video_url,
4205             'ext':      video_extension,
4206             'title':    video_title,
4207         }]
4208
4209 class HowcastIE(InfoExtractor):
4210     """Information Extractor for Howcast.com"""
4211     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4212
4213     def _real_extract(self, url):
4214         mobj = re.match(self._VALID_URL, url)
4215
4216         video_id = mobj.group('id')
4217         webpage_url = 'http://www.howcast.com/videos/' + video_id
4218         webpage = self._download_webpage(webpage_url, video_id)
4219
4220         self.report_extraction(video_id)
4221
4222         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
4223             webpage, u'video URL')
4224
4225         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
4226             webpage, u'title')
4227
4228         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
4229             webpage, u'description', fatal=False)
4230
4231         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
4232             webpage, u'thumbnail', fatal=False)
4233
4234         return [{
4235             'id':       video_id,
4236             'url':      video_url,
4237             'ext':      'mp4',
4238             'title':    video_title,
4239             'description': video_description,
4240             'thumbnail': thumbnail,
4241         }]
4242
4243 class VineIE(InfoExtractor):
4244     """Information Extractor for Vine.co"""
4245     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4246
4247     def _real_extract(self, url):
4248         mobj = re.match(self._VALID_URL, url)
4249
4250         video_id = mobj.group('id')
4251         webpage_url = 'https://vine.co/v/' + video_id
4252         webpage = self._download_webpage(webpage_url, video_id)
4253
4254         self.report_extraction(video_id)
4255
4256         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
4257             webpage, u'video URL')
4258
4259         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
4260             webpage, u'title')
4261
4262         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
4263             webpage, u'thumbnail', fatal=False)
4264
4265         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
4266             webpage, u'uploader', fatal=False, flags=re.DOTALL)
4267
4268         return [{
4269             'id':        video_id,
4270             'url':       video_url,
4271             'ext':       'mp4',
4272             'title':     video_title,
4273             'thumbnail': thumbnail,
4274             'uploader':  uploader,
4275         }]
4276
4277 class FlickrIE(InfoExtractor):
4278     """Information Extractor for Flickr videos"""
4279     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4280
4281     def _real_extract(self, url):
4282         mobj = re.match(self._VALID_URL, url)
4283
4284         video_id = mobj.group('id')
4285         video_uploader_id = mobj.group('uploader_id')
4286         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4287         webpage = self._download_webpage(webpage_url, video_id)
4288
4289         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
4290
4291         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4292         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
4293
4294         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
4295             first_xml, u'node_id')
4296
4297         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4298         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4299
4300         self.report_extraction(video_id)
4301
4302         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
4303         if mobj is None:
4304             raise ExtractorError(u'Unable to extract video url')
4305         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
4306
4307         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
4308             webpage, u'video title')
4309
4310         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
4311             webpage, u'description', fatal=False)
4312
4313         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
4314             webpage, u'thumbnail', fatal=False)
4315
4316         return [{
4317             'id':          video_id,
4318             'url':         video_url,
4319             'ext':         'mp4',
4320             'title':       video_title,
4321             'description': video_description,
4322             'thumbnail':   thumbnail,
4323             'uploader_id': video_uploader_id,
4324         }]
4325
4326 class TeamcocoIE(InfoExtractor):
4327     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4328
4329     def _real_extract(self, url):
4330         mobj = re.match(self._VALID_URL, url)
4331         if mobj is None:
4332             raise ExtractorError(u'Invalid URL: %s' % url)
4333         url_title = mobj.group('url_title')
4334         webpage = self._download_webpage(url, url_title)
4335
4336         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
4337             webpage, u'video id')
4338
4339         self.report_extraction(video_id)
4340
4341         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
4342             webpage, u'title')
4343
4344         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
4345             webpage, u'thumbnail', fatal=False)
4346
4347         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
4348             webpage, u'description', fatal=False)
4349
4350         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4351         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
4352
4353         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
4354             data, u'video URL')
4355
4356         return [{
4357             'id':          video_id,
4358             'url':         video_url,
4359             'ext':         'mp4',
4360             'title':       video_title,
4361             'thumbnail':   thumbnail,
4362             'description': video_description,
4363         }]
4364
4365 class XHamsterIE(InfoExtractor):
4366     """Information Extractor for xHamster"""
4367     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4368
4369     def _real_extract(self,url):
4370         mobj = re.match(self._VALID_URL, url)
4371
4372         video_id = mobj.group('id')
4373         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
4374         webpage = self._download_webpage(mrss_url, video_id)
4375
4376         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4377         if mobj is None:
4378             raise ExtractorError(u'Unable to extract media URL')
4379         if len(mobj.group('server')) == 0:
4380             video_url = compat_urllib_parse.unquote(mobj.group('file'))
4381         else:
4382             video_url = mobj.group('server')+'/key='+mobj.group('file')
4383         video_extension = video_url.split('.')[-1]
4384
4385         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
4386             webpage, u'title')
4387
4388         # Can't see the description anywhere in the UI
4389         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
4390         #     webpage, u'description', fatal=False)
4391         # if video_description: video_description = unescapeHTML(video_description)
4392
4393         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4394         if mobj:
4395             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4396         else:
4397             video_upload_date = None
4398             self._downloader.report_warning(u'Unable to extract upload date')
4399
4400         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
4401             webpage, u'uploader id', default=u'anonymous')
4402
4403         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
4404             webpage, u'thumbnail', fatal=False)
4405
4406         return [{
4407             'id':       video_id,
4408             'url':      video_url,
4409             'ext':      video_extension,
4410             'title':    video_title,
4411             # 'description': video_description,
4412             'upload_date': video_upload_date,
4413             'uploader_id': video_uploader_id,
4414             'thumbnail': video_thumbnail
4415         }]
4416
4417 class HypemIE(InfoExtractor):
4418     """Information Extractor for hypem"""
4419     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4420
4421     def _real_extract(self, url):
4422         mobj = re.match(self._VALID_URL, url)
4423         if mobj is None:
4424             raise ExtractorError(u'Invalid URL: %s' % url)
4425         track_id = mobj.group(1)
4426
4427         data = { 'ax': 1, 'ts': time.time() }
4428         data_encoded = compat_urllib_parse.urlencode(data)
4429         complete_url = url + "?" + data_encoded
4430         request = compat_urllib_request.Request(complete_url)
4431         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
4432         cookie = urlh.headers.get('Set-Cookie', '')
4433
4434         self.report_extraction(track_id)
4435
4436         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
4437             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
4438         try:
4439             track_list = json.loads(html_tracks)
4440             track = track_list[u'tracks'][0]
4441         except ValueError:
4442             raise ExtractorError(u'Hypemachine contained invalid JSON.')
4443
4444         key = track[u"key"]
4445         track_id = track[u"id"]
4446         artist = track[u"artist"]
4447         title = track[u"song"]
4448
4449         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4450         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4451         request.add_header('cookie', cookie)
4452         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4453         try:
4454             song_data = json.loads(song_data_json)
4455         except ValueError:
4456             raise ExtractorError(u'Hypemachine contained invalid JSON.')
4457         final_url = song_data[u"url"]
4458
4459         return [{
4460             'id':       track_id,
4461             'url':      final_url,
4462             'ext':      "mp3",
4463             'title':    title,
4464             'artist':   artist,
4465         }]
4466
4467 class Vbox7IE(InfoExtractor):
4468     """Information Extractor for Vbox7"""
4469     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
4470
4471     def _real_extract(self,url):
4472         mobj = re.match(self._VALID_URL, url)
4473         if mobj is None:
4474             raise ExtractorError(u'Invalid URL: %s' % url)
4475         video_id = mobj.group(1)
4476
4477         redirect_page, urlh = self._download_webpage_handle(url, video_id)
4478         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
4479         redirect_url = urlh.geturl() + new_location
4480         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
4481
4482         title = self._html_search_regex(r'<title>(.*)</title>',
4483             webpage, u'title').split('/')[0].strip()
4484
4485         ext = "flv"
4486         info_url = "http://vbox7.com/play/magare.do"
4487         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
4488         info_request = compat_urllib_request.Request(info_url, data)
4489         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
4490         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
4491         if info_response is None:
4492             raise ExtractorError(u'Unable to extract the media url')
4493         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
4494
4495         return [{
4496             'id':        video_id,
4497             'url':       final_url,
4498             'ext':       ext,
4499             'title':     title,
4500             'thumbnail': thumbnail_url,
4501         }]
4502
4503 class GametrailersIE(InfoExtractor):
4504     _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
4505
4506     def _real_extract(self, url):
4507         mobj = re.match(self._VALID_URL, url)
4508         if mobj is None:
4509             raise ExtractorError(u'Invalid URL: %s' % url)
4510         video_id = mobj.group('id')
4511         video_type = mobj.group('type')
4512         webpage = self._download_webpage(url, video_id)
4513         if video_type == 'full-episodes':
4514             mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
4515         else:
4516             mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
4517         mgid = self._search_regex(mgid_re, webpage, u'mgid')
4518         data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
4519
4520         info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
4521                                            video_id, u'Downloading video info')
4522         links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
4523                                                video_id, u'Downloading video urls info')
4524
4525         self.report_extraction(video_id)
4526         info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
4527                       <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
4528                       <image>.*
4529                         <url>(?P<thumb>.*?)</url>.*
4530                       </image>'''
4531
4532         m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
4533         if m_info is None:
4534             raise ExtractorError(u'Unable to extract video info')
4535         video_title = m_info.group('title')
4536         video_description = m_info.group('description')
4537         video_thumb = m_info.group('thumb')
4538
4539         m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
4540         if m_urls is None or len(m_urls) == 0:
4541             raise ExtractError(u'Unable to extrat video url')
4542         # They are sorted from worst to best quality
4543         video_url = m_urls[-1].group('url')
4544
4545         return {'url':         video_url,
4546                 'id':          video_id,
4547                 'title':       video_title,
4548                 # Videos are actually flv not mp4
4549                 'ext':         'flv',
4550                 'thumbnail':   video_thumb,
4551                 'description': video_description,
4552                 }
4553
4554 def gen_extractors():
4555     """ Return a list of an instance of every supported extractor.
4556     The order does matter; the first extractor matched is the one handling the URL.
4557     """
4558     return [
4559         YoutubePlaylistIE(),
4560         YoutubeChannelIE(),
4561         YoutubeUserIE(),
4562         YoutubeSearchIE(),
4563         YoutubeIE(),
4564         MetacafeIE(),
4565         DailymotionIE(),
4566         GoogleSearchIE(),
4567         PhotobucketIE(),
4568         YahooIE(),
4569         YahooSearchIE(),
4570         DepositFilesIE(),
4571         FacebookIE(),
4572         BlipTVIE(),
4573         BlipTVUserIE(),
4574         VimeoIE(),
4575         MyVideoIE(),
4576         ComedyCentralIE(),
4577         EscapistIE(),
4578         CollegeHumorIE(),
4579         XVideosIE(),
4580         SoundcloudSetIE(),
4581         SoundcloudIE(),
4582         InfoQIE(),
4583         MixcloudIE(),
4584         StanfordOpenClassroomIE(),
4585         MTVIE(),
4586         YoukuIE(),
4587         XNXXIE(),
4588         YouJizzIE(),
4589         PornotubeIE(),
4590         YouPornIE(),
4591         GooglePlusIE(),
4592         ArteTvIE(),
4593         NBAIE(),
4594         WorldStarHipHopIE(),
4595         JustinTVIE(),
4596         FunnyOrDieIE(),
4597         SteamIE(),
4598         UstreamIE(),
4599         RBMARadioIE(),
4600         EightTracksIE(),
4601         KeekIE(),
4602         TEDIE(),
4603         MySpassIE(),
4604         SpiegelIE(),
4605         LiveLeakIE(),
4606         ARDIE(),
4607         ZDFIE(),
4608         TumblrIE(),
4609         BandcampIE(),
4610         RedTubeIE(),
4611         InaIE(),
4612         HowcastIE(),
4613         VineIE(),
4614         FlickrIE(),
4615         TeamcocoIE(),
4616         XHamsterIE(),
4617         HypemIE(),
4618         Vbox7IE(),
4619         GametrailersIE(),
4620         GenericIE()
4621     ]
4622
4623 def get_info_extractor(ie_name):
4624     """Returns the info extractor class with the given ie_name"""
4625     return globals()[ie_name+'IE']