2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): this excerpt is missing interleaved lines (the original
    # line numbering jumps); "[elided: ...]" comments below mark those gaps.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # [elided: one line, presumably initialising a "ready" flag -- TODO confirm]
        self.set_downloader(downloader)

    # NOTE(review): matches `url` against cls._VALID_URL, so this reads as a
    # classmethod; the @classmethod decorator is not visible in this excerpt.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    # [elided: `working()` definition line -- only its docstring is visible]
        """Getter method for _WORKING."""

    # [elided: `initialize()` definition line -- only parts of its body are visible]
        """Initializes an instance (authentication, etc)."""
        # [elided: presumably a run-once guard -- TODO confirm]
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # [elided: one line, presumably `self.initialize()` -- TODO confirm]
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # [elided: IE_NAME property definition -- only its body is visible.
    #  Strips the trailing "IE" from the class name.]
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # [elided: `if note is None:`]
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        # [elided: `try:`]
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # [elided: `if errnote is None:` guard]
                errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pick the charset parameter out of the Content-Type header, if any.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # [elided: `if m:` and an `else:` branch choosing a fallback encoding]
            encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # [elided: `try:`]
                url = url_or_request.get_full_url()
            except AttributeError:
                # [elided: fallback, presumably `url = video_id` -- TODO confirm]
            self.to_screen(u'Dumping request to ' + url)
            # base64-encode so binary responses survive the terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        # [elided: presumably `return video_info` -- TODO confirm]

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # [elided: remaining dict entries (url / ie key) and the return]

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # [elided: 'entries' entry, closing brace, and an `if playlist_id:` guard]
            video_info['id'] = playlist_id
        # [elided: `if playlist_title:` guard]
            video_info['title'] = playlist_title
        # [elided: return]

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        # [elided: `else:` branch looping over `p` in the pattern list]
            mobj = re.search(p, string, flags)

        # Colourise the field name on capable terminals (not Windows).
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        # [elided: `else: _name = name`]

        # [elided: `if mobj:`]
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            # [elided: `return default`]
        # [elided: presumably `elif fatal:`]
            raise ExtractorError(u'Unable to extract %s' % _name)
        # [elided: `else:` -- non-fatal failure only warns]
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on GitHub.' % _name)

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        # [elided: `if res:` guard and its `else` branch]
            return clean_html(res).strip()
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    # NOTE(review): reads as a classmethod (takes `cls`); the @classmethod
    # decorator is not visible in this excerpt.
    def _make_valid_url(cls):
        # prefix is empty (default: one result), a positive integer, or "all".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    # NOTE(review): same as above -- @classmethod decorator not visible here.
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # [elided: `if prefix == '':` -- empty prefix means a single result]
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        # [elided: `else:` branch converting the prefix to int `n`, and a
        #  `if n <= 0:` guard]
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            # Clamp oversized requests to the extractor's maximum.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): "sublclasses" is a typo for "subclasses" in the message
        # below (runtime string, left unchanged in this documentation pass).
        raise NotImplementedError("This method must be implemented by sublclasses")
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # [elided: the `_VALID_URL = r"""^` opening and closing lines of the
    #  verbose-mode regex -- only its interior is visible below]
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                             )?                                               # optional -> youtube.com/xxxx is OK
                         )?                                                   # all until now is optional -> you can pass the naked ID
                         ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
                         (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension; only one entry of the mapping is visible here.
    _video_extensions = {
        # [elided: most itag -> extension entries]
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # [elided: remaining entries and closing brace]
    # itag -> "WxH" display string; entries elided in this excerpt.
    _video_dimensions = {
    # [elided: itag -> dimension entries and closing brace]

    # NOTE(review): @classmethod decorator not visible in this excerpt.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report that we are checking which subtitles are available."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    # NOTE(review): takes no `self`, so this reads as a @staticmethod; the
    # decorator is not visible in this excerpt.
    def _decrypt_signature(s):
        """Decrypt the key the two subkeys must have a length of 43"""
        # [elided: splitting `s` into subkeys `a` and `b` -- TODO confirm]
        if len(a) != 43 or len(b) != 43:
            raise ExtractorError(u'Unable to decrypt signature, subkeys lengths not valid')
        # Rebuild b by splicing fixed positions from a and b, truncated to 40.
        b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40]
        s_dec = '.'.join((a,b))[::-1]
        # [elided: final return of the decrypted signature -- TODO confirm]

    def _get_available_subtitles(self, video_id):
        """Fetch the subtitle-language list; on error returns an (errmsg, None) tuple."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # [elided: `try:`]
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # lang_code -> track name
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        # [elided: `return sub_lang_list` -- TODO confirm]

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for this video."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Download a single subtitle track; returns a tuple
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        # [elided: query parameters (lang/name/v/fmt) and closing brace]
        url = 'http://www.youtube.com/api/timedtext?' + params
        # [elided: `try:`]
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        # [elided: `if not sub:` guard]
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _request_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_lang = self._downloader.params.get('subtitleslang')
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption URL lives inside the inline ytplayer.config JSON blob.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
        # [elided: `if mobj is None:` guard]
            return [(err_msg, None, None)]
        player_config = json.loads(mobj.group(1))
        # [elided: `try:` around the key accesses below]
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            params = compat_urllib_parse.urlencode({
            # [elided: caption query parameters and closing brace]
            subtitles_url = caption_url + '&' + params
            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
            return [(None, sub_lang, sub)]
        # [elided: `except KeyError:` -- TODO confirm the exception type]
            return [(err_msg, None, None)]

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            # [elided: `sub_lang = 'en'` and the final `else:` branch]
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        # [elided: presumably `return [subtitle]` -- TODO confirm]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # [elided: `subtitles = []` initialisation]
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        # [elided: `return subtitles`]

    def _print_formats(self, formats):
        """Print one line per itag: "itag : ext [WxH]"."""
        print('Available formats:')
        # [elided: `for x in formats:` loop header]
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age on youtube.com."""
        if self._downloader is None:
            # [elided: `return`]

        # [elided: username/password default initialisation]
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # [elided: `try:`]
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                # [elided: `if info is not None:` branch using info, `else:`]
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                # [elided: `return`]

        # Set language (best-effort: only warns on failure)
        request = compat_urllib_request.Request(self._LANG_URL)
        # [elided: `try:` and `self.report_lang()`]
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # [elided: `if username is None: return` guard -- TODO confirm]

        request = compat_urllib_request.Request(self._LOGIN_URL)
        # [elided: `try:`]
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            # [elided: `return`]

        # Scrape the anti-forgery tokens out of the login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        # [elided: `if match:` guard]
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        # [elided: dsh capture and the opening of the login_form_strs dict,
        #  including the Email/Passwd/GALX/dsh entries]
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',
        # [elided: remaining form fields and closing brace]
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # [elided: the rest of this explanatory comment]
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        # [elided: `try:` and `self.report_login()`]
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                # [elided: `return`]
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            # [elided: `return`]

        # Confirm age
        # [elided: opening of the age_form dict (`next_url` entry)]
            'action_confirm': 'Confirm',
        # [elided: closing brace]
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # [elided: `try:`]
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the YouTube video id captured by _VALID_URL, or raise."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        # [elided: `return video_id`]

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # [elided: `if mobj:` guard]
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Download the watch page.
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        # [elided: `try:`]
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # [elided: `if mobj is not None:` -- un-escape the JSON-escaped URL]
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        # [elided: `else: player_url = None` -- TODO confirm]

        # Fetch get_video_info, trying several `el` variants until one
        # yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    # [elided: presumably `note=False,` -- TODO confirm]
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                # [elided: `break`]
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
            # [elided: `else:`]
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (non-fatal)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        # [elided: `if mobj is not None:` guard]
            video_uploader_id = mobj.group(1)
        # [elided: `else:`]
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail (non-fatal)
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            # [elided: fallback assignment to video_thumbnail -- TODO confirm]
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the page, normalised to YYYYMMDD.
        # [elided: `upload_date = None` initialisation -- TODO confirm]
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # [elided: `if mobj is not None:` guard]
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description: element first, <meta> tag as fallback.
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        # [elided: `else:` branch]
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            # [elided: `if fd_mobj:` guard]
                video_description = unescapeHTML(fd_mobj.group(1))
            # [elided: `else:`]
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            # [elided: `if video_subtitles:` guard]
                (sub_error, sub_lang, sub) = video_subtitles[0]
                # [elided: `if sub_error:` guard]
                    # We try with the automatic captions
                    video_subtitles = self._request_automatic_caption(video_id, video_webpage)
                    (sub_error_auto, sub_lang, sub) = video_subtitles[0]
                    # [elided: check on whether the automatic caption succeeded]
                        # We report the original error
                        self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                # [elided: `if sub_error:` guard]
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            # [elided: `return` -- listing ends extraction]

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            # [elided: fallback assignment to video_duration]
        # [elided: `else:`]
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        # [elided: `try:` around the ytplayer.config parsing below]
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
            info = json.loads(mobj.group(1))
            # [elided: presumably `args = info['args']` -- TODO confirm]
            # NOTE(review): the condition below is always True -- `or 'dashmpd'`
            # tests the truthiness of the string literal, not membership.
            # Presumably it was meant to be `or 'dashmpd' in args` (or similar);
            # as written, every video with a parseable config takes this branch.
            # TODO confirm against upstream before relying on this behaviour.
            if args.get('ptk','') == 'vevo' or 'dashmpd':
                # Vevo videos with encrypted signatures
                self.to_screen(u'Vevo video detected.')
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        # [elided: `except` clause swallowing parse failures -- TODO confirm]

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> download URL map.
            # [elided: `url_map = {}` initialisation]
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        url += '&signature=' + url_data['sig'][0]
                    # [elided: `elif 's' in url_data:` -- encrypted signature]
                        signature = self._decrypt_signature(url_data['s'][0])
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            # [elided: `else:`]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # [elided: `return` -- listing ends extraction]
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # [elided: `else:` branch]
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # [elided: `if rf in url_map:` guard]
                        video_url_list = [(rf, url_map[rf])]
                        # [elided: `break`]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        # [elided: `else:`]
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # [elided: `results = []` initialisation -- TODO confirm]
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # [elided: `results.append({` opener and the 'id' entry]
                'url':          video_real_url,
                'uploader':     video_uploader,
                'uploader_id':  video_uploader_id,
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension,
                'format':       video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            # [elided: closing `})` and the final `return results`]
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and POST an over-18 confirmation."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # [elided: `try:`]
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        # [elided: opening of the disclaimer_form dict (e.g. a 'filters' entry)]
            'submit': "Continue - I'm over 18",
        # [elided: closing brace]
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # [elided: `try:`]
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # [elided: `if mobj is not None:` -- direct mediaURL branch]
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            # [elided: `if mobj is None:` fallback and `else:`]
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # [elided: `else:` -- flashvars fallback branch]
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            # [elided: `if mobj is None:` guard]
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            # [elided: `if mobj is None:` guard]
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): .decode('utf-8') on these values is a Python-2 idiom;
        # on Python 3 `str` has no .decode -- TODO confirm target interpreter.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # [elided: `return [{` opener]
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            # [elided: 'upload_date' entry]
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        # [elided: closing `}]`]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is the part before any "_<slug>" suffix or query string.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so restricted videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality present, scanning from highest to lowest.
        # [elided: initialisation/selection of `max_quality`]
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # [elided: `if key in flashvars:` guard assigning max_quality]
                self.to_screen(u'Using %s' % key)
                # [elided: `break`]
        # [elided: `else:` -- no quality key found]
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        # NOTE(review): this None initialisation is immediately overwritten.
        video_uploader = None
        video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                             # Looking for official user
                                             r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
                                            webpage, 'video uploader')

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # [elided: `if mobj is not None:` guard]
            # DD-MM-YYYY scraped from the page, reassembled as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # [elided: `return [{` opener with 'id' and 'url' entries]
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        # [elided: closing `}]`]
991 class PhotobucketIE(InfoExtractor):
992 """Information extractor for photobucket.com."""
994 # TODO: the original _VALID_URL was:
995 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
996 # Check if it's necessary to keep the old extracion process
997 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
998 IE_NAME = u'photobucket'
1000 def _real_extract(self, url):
1001 # Extract id from URL
1002 mobj = re.match(self._VALID_URL, url)
1004 raise ExtractorError(u'Invalid URL: %s' % url)
1006 video_id = mobj.group('id')
1008 video_extension = mobj.group('ext')
1010 # Retrieve video webpage to extract further information
1011 webpage = self._download_webpage(url, video_id)
1013 # Extract URL, uploader, and title from webpage
1014 self.report_extraction(video_id)
1015 # We try first by looking the javascript code:
1016 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
1017 if mobj is not None:
1018 info = json.loads(mobj.group('json'))
1021 'url': info[u'downloadUrl'],
1022 'uploader': info[u'username'],
1023 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
1024 'title': info[u'title'],
1025 'ext': video_extension,
1026 'thumbnail': info[u'thumbUrl'],
1029 # We try looking in other parts of the webpage
1030 video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1031 webpage, u'video URL')
1033 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1035 raise ExtractorError(u'Unable to extract title')
1036 video_title = mobj.group(1).decode('utf-8')
1037 video_uploader = mobj.group(2).decode('utf-8')
1040 'id': video_id.decode('utf-8'),
1041 'url': video_url.decode('utf-8'),
1042 'uploader': video_uploader,
1043 'upload_date': None,
1044 'title': video_title,
1045 'ext': video_extension.decode('utf-8'),
1049 class YahooIE(InfoExtractor):
1050 """Information extractor for screen.yahoo.com."""
1051 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1053 def _real_extract(self, url):
1054 mobj = re.match(self._VALID_URL, url)
1056 raise ExtractorError(u'Invalid URL: %s' % url)
1057 video_id = mobj.group('id')
1058 webpage = self._download_webpage(url, video_id)
1059 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1062 # TODO: Check which url parameters are required
1063 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1064 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
1065 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1066 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1067 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1068 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1070 self.report_extraction(video_id)
1071 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1073 raise ExtractorError(u'Unable to extract video info')
1074 video_title = m_info.group('title')
1075 video_description = m_info.group('description')
1076 video_thumb = m_info.group('thumb')
1077 video_date = m_info.group('date')
1078 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1080 # TODO: Find a way to get mp4 videos
1081 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1082 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1083 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1084 video_url = m_rest.group('url')
1085 video_path = m_rest.group('path')
1087 raise ExtractorError(u'Unable to extract video url')
1089 else: # We have to use a different method if another id is defined
1090 long_id = m_id.group('new_id')
1091 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1092 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
1093 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1094 info = json.loads(json_str)
1095 res = info[u'query'][u'results'][u'mediaObj'][0]
1096 stream = res[u'streams'][0]
1097 video_path = stream[u'path']
1098 video_url = stream[u'host']
1100 video_title = meta[u'title']
1101 video_description = meta[u'description']
1102 video_thumb = meta[u'thumbnail']
1103 video_date = None # I can't find it
1108 'play_path': video_path,
1109 'title':video_title,
1110 'description': video_description,
1111 'thumbnail': video_thumb,
1112 'upload_date': video_date,
1117 class VimeoIE(InfoExtractor):
1118 """Information extractor for vimeo.com."""
1120 # _VALID_URL matches Vimeo URLs
1121 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1124 def _real_extract(self, url, new_video=True):
1125 # Extract ID from URL
1126 mobj = re.match(self._VALID_URL, url)
1128 raise ExtractorError(u'Invalid URL: %s' % url)
1130 video_id = mobj.group('id')
1131 if not mobj.group('proto'):
1132 url = 'https://' + url
1133 if mobj.group('direct_link') or mobj.group('pro'):
1134 url = 'https://vimeo.com/' + video_id
1136 # Retrieve video webpage to extract further information
1137 request = compat_urllib_request.Request(url, None, std_headers)
1138 webpage = self._download_webpage(request, video_id)
1140 # Now we begin extracting as much information as we can from what we
1141 # retrieved. First we extract the information common to all extractors,
1142 # and latter we extract those that are Vimeo specific.
1143 self.report_extraction(video_id)
1145 # Extract the config JSON
1147 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1148 config = json.loads(config)
1150 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1151 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1153 raise ExtractorError(u'Unable to extract info section')
1156 video_title = config["video"]["title"]
1158 # Extract uploader and uploader_id
1159 video_uploader = config["video"]["owner"]["name"]
1160 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1162 # Extract video thumbnail
1163 video_thumbnail = config["video"]["thumbnail"]
1165 # Extract video description
1166 video_description = get_element_by_attribute("itemprop", "description", webpage)
1167 if video_description: video_description = clean_html(video_description)
1168 else: video_description = u''
1170 # Extract upload date
1171 video_upload_date = None
1172 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1173 if mobj is not None:
1174 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1176 # Vimeo specific: extract request signature and timestamp
1177 sig = config['request']['signature']
1178 timestamp = config['request']['timestamp']
1180 # Vimeo specific: extract video codec and quality information
1181 # First consider quality, then codecs, then take everything
1182 # TODO bind to format param
1183 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1184 files = { 'hd': [], 'sd': [], 'other': []}
1185 for codec_name, codec_extension in codecs:
1186 if codec_name in config["video"]["files"]:
1187 if 'hd' in config["video"]["files"][codec_name]:
1188 files['hd'].append((codec_name, codec_extension, 'hd'))
1189 elif 'sd' in config["video"]["files"][codec_name]:
1190 files['sd'].append((codec_name, codec_extension, 'sd'))
1192 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1194 for quality in ('hd', 'sd', 'other'):
1195 if len(files[quality]) > 0:
1196 video_quality = files[quality][0][2]
1197 video_codec = files[quality][0][0]
1198 video_extension = files[quality][0][1]
1199 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1202 raise ExtractorError(u'No known codec found')
1204 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1205 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1210 'uploader': video_uploader,
1211 'uploader_id': video_uploader_id,
1212 'upload_date': video_upload_date,
1213 'title': video_title,
1214 'ext': video_extension,
1215 'thumbnail': video_thumbnail,
1216 'description': video_description,
1220 class ArteTvIE(InfoExtractor):
1221 """arte.tv information extractor."""
1223 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1224 _LIVE_URL = r'index-[0-9]+\.html$'
1226 IE_NAME = u'arte.tv'
1228 def fetch_webpage(self, url):
1229 request = compat_urllib_request.Request(url)
1231 self.report_download_webpage(url)
1232 webpage = compat_urllib_request.urlopen(request).read()
1233 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1234 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1235 except ValueError as err:
1236 raise ExtractorError(u'Invalid URL: %s' % url)
1239 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1240 page = self.fetch_webpage(url)
1241 mobj = re.search(regex, page, regexFlags)
1245 raise ExtractorError(u'Invalid URL: %s' % url)
1247 for (i, key, err) in matchTuples:
1248 if mobj.group(i) is None:
1249 raise ExtractorError(err)
1251 info[key] = mobj.group(i)
1255 def extractLiveStream(self, url):
1256 video_lang = url.split('/')[-4]
1257 info = self.grep_webpage(
1259 r'src="(.*?/videothek_js.*?\.js)',
1262 (1, 'url', u'Invalid URL: %s' % url)
1265 http_host = url.split('/')[2]
1266 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1267 info = self.grep_webpage(
1269 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1270 '(http://.*?\.swf).*?' +
1274 (1, 'path', u'could not extract video path: %s' % url),
1275 (2, 'player', u'could not extract video player: %s' % url),
1276 (3, 'url', u'could not extract video url: %s' % url)
1279 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1281 def extractPlus7Stream(self, url):
1282 video_lang = url.split('/')[-3]
1283 info = self.grep_webpage(
1285 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1288 (1, 'url', u'Invalid URL: %s' % url)
1291 next_url = compat_urllib_parse.unquote(info.get('url'))
1292 info = self.grep_webpage(
1294 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1297 (1, 'url', u'Could not find <video> tag: %s' % url)
1300 next_url = compat_urllib_parse.unquote(info.get('url'))
1302 info = self.grep_webpage(
1304 r'<video id="(.*?)".*?>.*?' +
1305 '<name>(.*?)</name>.*?' +
1306 '<dateVideo>(.*?)</dateVideo>.*?' +
1307 '<url quality="hd">(.*?)</url>',
1310 (1, 'id', u'could not extract video id: %s' % url),
1311 (2, 'title', u'could not extract video title: %s' % url),
1312 (3, 'date', u'could not extract video date: %s' % url),
1313 (4, 'url', u'could not extract video url: %s' % url)
1318 'id': info.get('id'),
1319 'url': compat_urllib_parse.unquote(info.get('url')),
1320 'uploader': u'arte.tv',
1321 'upload_date': unified_strdate(info.get('date')),
1322 'title': info.get('title').decode('utf-8'),
1328 def _real_extract(self, url):
1329 video_id = url.split('/')[-1]
1330 self.report_extraction(video_id)
1332 if re.search(self._LIVE_URL, video_id) is not None:
1333 self.extractLiveStream(url)
1336 info = self.extractPlus7Stream(url)
1341 class GenericIE(InfoExtractor):
1342 """Generic last-resort information extractor."""
1345 IE_NAME = u'generic'
1347 def report_download_webpage(self, video_id):
1348 """Report webpage download."""
1349 if not self._downloader.params.get('test', False):
1350 self._downloader.report_warning(u'Falling back on generic information extractor.')
1351 super(GenericIE, self).report_download_webpage(video_id)
1353 def report_following_redirect(self, new_url):
1354 """Report information extraction."""
1355 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1357 def _test_redirect(self, url):
1358 """Check if it is a redirect, like url shorteners, in case return the new url."""
1359 class HeadRequest(compat_urllib_request.Request):
1360 def get_method(self):
1363 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1365 Subclass the HTTPRedirectHandler to make it use our
1366 HeadRequest also on the redirected URL
1368 def redirect_request(self, req, fp, code, msg, headers, newurl):
1369 if code in (301, 302, 303, 307):
1370 newurl = newurl.replace(' ', '%20')
1371 newheaders = dict((k,v) for k,v in req.headers.items()
1372 if k.lower() not in ("content-length", "content-type"))
1373 return HeadRequest(newurl,
1375 origin_req_host=req.get_origin_req_host(),
1378 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1380 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1382 Fallback to GET if HEAD is not allowed (405 HTTP error)
1384 def http_error_405(self, req, fp, code, msg, headers):
1388 newheaders = dict((k,v) for k,v in req.headers.items()
1389 if k.lower() not in ("content-length", "content-type"))
1390 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1392 origin_req_host=req.get_origin_req_host(),
1396 opener = compat_urllib_request.OpenerDirector()
1397 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1398 HTTPMethodFallback, HEADRedirectHandler,
1399 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1400 opener.add_handler(handler())
1402 response = opener.open(HeadRequest(url))
1403 if response is None:
1404 raise ExtractorError(u'Invalid URL protocol')
1405 new_url = response.geturl()
1410 self.report_following_redirect(new_url)
1413 def _real_extract(self, url):
1414 new_url = self._test_redirect(url)
1415 if new_url: return [self.url_result(new_url)]
1417 video_id = url.split('/')[-1]
1419 webpage = self._download_webpage(url, video_id)
1420 except ValueError as err:
1421 # since this is the last-resort InfoExtractor, if
1422 # this error is thrown, it'll be thrown here
1423 raise ExtractorError(u'Invalid URL: %s' % url)
1425 self.report_extraction(video_id)
1426 # Start with something easy: JW Player in SWFObject
1427 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1429 # Broaden the search a little bit
1430 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1432 # Broaden the search a little bit: JWPlayer JS loader
1433 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1435 # Try to find twitter cards info
1436 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1438 raise ExtractorError(u'Invalid URL: %s' % url)
1440 # It's possible that one of the regexes
1441 # matched, but returned an empty group:
1442 if mobj.group(1) is None:
1443 raise ExtractorError(u'Invalid URL: %s' % url)
1445 video_url = compat_urllib_parse.unquote(mobj.group(1))
1446 video_id = os.path.basename(video_url)
1448 # here's a fun little line of code for you:
1449 video_extension = os.path.splitext(video_id)[1][1:]
1450 video_id = os.path.splitext(video_id)[0]
1452 # it's tempting to parse this further, but you would
1453 # have to take into account all the variations like
1454 # Video Title - Site Name
1455 # Site Name | Video Title
1456 # Video Title - Tagline | Site Name
1457 # and so on and so forth; it's just not practical
1458 video_title = self._html_search_regex(r'<title>(.*)</title>',
1459 webpage, u'video title')
1461 # video uploader is domain name
1462 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1463 url, u'video uploader')
1468 'uploader': video_uploader,
1469 'upload_date': None,
1470 'title': video_title,
1471 'ext': video_extension,
1475 class YoutubeSearchIE(SearchInfoExtractor):
1476 """Information Extractor for YouTube search queries."""
1477 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1479 IE_NAME = u'youtube:search'
1480 _SEARCH_KEY = 'ytsearch'
1482 def report_download_page(self, query, pagenum):
1483 """Report attempt to download search page with given number."""
1484 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1486 def _get_n_results(self, query, n):
1487 """Get a specified number of results for a query"""
1493 while (50 * pagenum) < limit:
1494 self.report_download_page(query, pagenum+1)
1495 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1496 request = compat_urllib_request.Request(result_url)
1498 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1499 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1500 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1501 api_response = json.loads(data)['data']
1503 if not 'items' in api_response:
1504 raise ExtractorError(u'[youtube] No video results')
1506 new_ids = list(video['id'] for video in api_response['items'])
1507 video_ids += new_ids
1509 limit = min(n, api_response['totalItems'])
1512 if len(video_ids) > n:
1513 video_ids = video_ids[:n]
1514 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1515 return self.playlist_result(videos, query)
1518 class GoogleSearchIE(SearchInfoExtractor):
1519 """Information Extractor for Google Video search queries."""
1520 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1522 IE_NAME = u'video.google:search'
1523 _SEARCH_KEY = 'gvsearch'
1525 def _get_n_results(self, query, n):
1526 """Get a specified number of results for a query"""
1529 '_type': 'playlist',
1534 for pagenum in itertools.count(1):
1535 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1536 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1537 note='Downloading result page ' + str(pagenum))
1539 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1542 'url': mobj.group(1)
1544 res['entries'].append(e)
1546 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1549 class YahooSearchIE(SearchInfoExtractor):
1550 """Information Extractor for Yahoo! Video search queries."""
1553 IE_NAME = u'screen.yahoo:search'
1554 _SEARCH_KEY = 'yvsearch'
1556 def _get_n_results(self, query, n):
1557 """Get a specified number of results for a query"""
1560 '_type': 'playlist',
1564 for pagenum in itertools.count(0):
1565 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1566 webpage = self._download_webpage(result_url, query,
1567 note='Downloading results page '+str(pagenum+1))
1568 info = json.loads(webpage)
1570 results = info[u'results']
1572 for (i, r) in enumerate(results):
1573 if (pagenum * 30) +i >= n:
1575 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1576 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1577 res['entries'].append(e)
1578 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1584 class YoutubePlaylistIE(InfoExtractor):
1585 """Information Extractor for YouTube playlists."""
1587 _VALID_URL = r"""(?:
1592 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1593 \? (?:.*?&)*? (?:p|a|list)=
1596 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1599 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1601 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1603 IE_NAME = u'youtube:playlist'
1606 def suitable(cls, url):
1607 """Receives a URL and returns True if suitable for this IE."""
1608 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1610 def _real_extract(self, url):
1611 # Extract playlist id
1612 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1614 raise ExtractorError(u'Invalid URL: %s' % url)
1616 # Download playlist videos from API
1617 playlist_id = mobj.group(1) or mobj.group(2)
1622 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1623 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1626 response = json.loads(page)
1627 except ValueError as err:
1628 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1630 if 'feed' not in response:
1631 raise ExtractorError(u'Got a malformed response from YouTube API')
1632 playlist_title = response['feed']['title']['$t']
1633 if 'entry' not in response['feed']:
1634 # Number of videos is a multiple of self._MAX_RESULTS
1637 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1638 for entry in response['feed']['entry']
1639 if 'content' in entry ]
1641 if len(response['feed']['entry']) < self._MAX_RESULTS:
1645 videos = [v[1] for v in sorted(videos)]
1647 url_results = [self.url_result(url, 'Youtube') for url in videos]
1648 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1651 class YoutubeChannelIE(InfoExtractor):
1652 """Information Extractor for YouTube channels."""
1654 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1655 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1656 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1657 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1658 IE_NAME = u'youtube:channel'
1660 def extract_videos_from_page(self, page):
1662 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1663 if mobj.group(1) not in ids_in_page:
1664 ids_in_page.append(mobj.group(1))
1667 def _real_extract(self, url):
1668 # Extract channel id
1669 mobj = re.match(self._VALID_URL, url)
1671 raise ExtractorError(u'Invalid URL: %s' % url)
1673 # Download channel page
1674 channel_id = mobj.group(1)
1678 url = self._TEMPLATE_URL % (channel_id, pagenum)
1679 page = self._download_webpage(url, channel_id,
1680 u'Downloading page #%s' % pagenum)
1682 # Extract video identifiers
1683 ids_in_page = self.extract_videos_from_page(page)
1684 video_ids.extend(ids_in_page)
1686 # Download any subsequent channel pages using the json-based channel_ajax query
1687 if self._MORE_PAGES_INDICATOR in page:
1689 pagenum = pagenum + 1
1691 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1692 page = self._download_webpage(url, channel_id,
1693 u'Downloading page #%s' % pagenum)
1695 page = json.loads(page)
1697 ids_in_page = self.extract_videos_from_page(page['content_html'])
1698 video_ids.extend(ids_in_page)
1700 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1703 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1705 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1706 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1707 return [self.playlist_result(url_entries, channel_id)]
1710 class YoutubeUserIE(InfoExtractor):
1711 """Information Extractor for YouTube users."""
1713 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1714 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1715 _GDATA_PAGE_SIZE = 50
1716 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1717 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1718 IE_NAME = u'youtube:user'
1720 def _real_extract(self, url):
1722 mobj = re.match(self._VALID_URL, url)
1724 raise ExtractorError(u'Invalid URL: %s' % url)
1726 username = mobj.group(1)
1728 # Download video ids using YouTube Data API. Result size per
1729 # query is limited (currently to 50 videos) so we need to query
1730 # page by page until there are no video ids - it means we got
1737 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1739 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1740 page = self._download_webpage(gdata_url, username,
1741 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1743 # Extract video identifiers
1746 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1747 if mobj.group(1) not in ids_in_page:
1748 ids_in_page.append(mobj.group(1))
1750 video_ids.extend(ids_in_page)
1752 # A little optimization - if current page is not
1753 # "full", ie. does not contain PAGE_SIZE video ids then
1754 # we can assume that this page is the last one - there
1755 # are no more ids on further pages - no need to query
1758 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1763 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1764 url_results = [self.url_result(url, 'Youtube') for url in urls]
1765 return [self.playlist_result(url_results, playlist_title = username)]
1768 class BlipTVUserIE(InfoExtractor):
1769 """Information Extractor for blip.tv users."""
1771 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1773 IE_NAME = u'blip.tv:user'
1775 def _real_extract(self, url):
1777 mobj = re.match(self._VALID_URL, url)
1779 raise ExtractorError(u'Invalid URL: %s' % url)
1781 username = mobj.group(1)
1783 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1785 page = self._download_webpage(url, username, u'Downloading user page')
1786 mobj = re.search(r'data-users-id="([^"]+)"', page)
1787 page_base = page_base % mobj.group(1)
1790 # Download video ids using BlipTV Ajax calls. Result size per
1791 # query is limited (currently to 12 videos) so we need to query
1792 # page by page until there are no video ids - it means we got
1799 url = page_base + "&page=" + str(pagenum)
1800 page = self._download_webpage(url, username,
1801 u'Downloading video ids from page %d' % pagenum)
1803 # Extract video identifiers
1806 for mobj in re.finditer(r'href="/([^"]+)"', page):
1807 if mobj.group(1) not in ids_in_page:
1808 ids_in_page.append(unescapeHTML(mobj.group(1)))
1810 video_ids.extend(ids_in_page)
1812 # A little optimization - if current page is not
1813 # "full", ie. does not contain PAGE_SIZE video ids then
1814 # we can assume that this page is the last one - there
1815 # are no more ids on further pages - no need to query
1818 if len(ids_in_page) < self._PAGE_SIZE:
1823 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1824 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1825 return [self.playlist_result(url_entries, playlist_title = username)]
1828 class DepositFilesIE(InfoExtractor):
1829 """Information extractor for depositfiles.com"""
1831 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1833 def _real_extract(self, url):
1834 file_id = url.split('/')[-1]
1835 # Rebuild url in english locale
1836 url = 'http://depositfiles.com/en/files/' + file_id
1838 # Retrieve file webpage with 'Free download' button pressed
1839 free_download_indication = { 'gateway_result' : '1' }
1840 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1842 self.report_download_webpage(file_id)
1843 webpage = compat_urllib_request.urlopen(request).read()
1844 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1845 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1847 # Search for the real file URL
1848 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1849 if (mobj is None) or (mobj.group(1) is None):
1850 # Try to figure out reason of the error.
1851 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1852 if (mobj is not None) and (mobj.group(1) is not None):
1853 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1854 raise ExtractorError(u'%s' % restriction_message)
1856 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1858 file_url = mobj.group(1)
1859 file_extension = os.path.splitext(file_url)[1][1:]
1861 # Search for file title
1862 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
1865 'id': file_id.decode('utf-8'),
1866 'url': file_url.decode('utf-8'),
1868 'upload_date': None,
1869 'title': file_title,
1870 'ext': file_extension.decode('utf-8'),
1874 class FacebookIE(InfoExtractor):
1875 """Information Extractor for Facebook"""
1877 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1878 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1879 _NETRC_MACHINE = 'facebook'
1880 IE_NAME = u'facebook'
1882 def report_login(self):
1883 """Report attempt to log in."""
1884 self.to_screen(u'Logging in')
1886 def _real_initialize(self):
1887 if self._downloader is None:
1892 downloader_params = self._downloader.params
1894 # Attempt to use provided username and password or .netrc data
1895 if downloader_params.get('username', None) is not None:
1896 useremail = downloader_params['username']
1897 password = downloader_params['password']
1898 elif downloader_params.get('usenetrc', False):
1900 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1901 if info is not None:
1905 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1906 except (IOError, netrc.NetrcParseError) as err:
1907 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1910 if useremail is None:
1919 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1922 login_results = compat_urllib_request.urlopen(request).read()
1923 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1924 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1926 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1927 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1930 def _real_extract(self, url):
1931 mobj = re.match(self._VALID_URL, url)
1933 raise ExtractorError(u'Invalid URL: %s' % url)
1934 video_id = mobj.group('ID')
1936 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1937 webpage = self._download_webpage(url, video_id)
1939 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1940 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1941 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1943 raise ExtractorError(u'Cannot parse data')
1944 data = dict(json.loads(m.group(1)))
1945 params_raw = compat_urllib_parse.unquote(data['params'])
1946 params = json.loads(params_raw)
1947 video_data = params['video_data'][0]
1948 video_url = video_data.get('hd_src')
1950 video_url = video_data['sd_src']
1952 raise ExtractorError(u'Cannot find video URL')
1953 video_duration = int(video_data['video_duration'])
1954 thumbnail = video_data['thumbnail_src']
1956 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1961 'title': video_title,
1964 'duration': video_duration,
1965 'thumbnail': thumbnail,
# NOTE(review): this excerpt is elided — several interior lines (try/else
# branches, the dict-literal openers around the 'upload_date'/'id' entries,
# and the branch computing `cchar`) are missing here. Comments below only
# describe what the visible lines demonstrate; confirm against full source.
1970 class BlipTVIE(InfoExtractor):
1971 """Information extractor for blip.tv"""
# Matches regular pages, /play/ embed URLs, and api.swf fragment URLs.
1973 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Used to pull the filename extension off the final media URL.
1974 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1975 IE_NAME = u'blip.tv'
1977 def report_direct_download(self, title):
1978 """Report information extraction."""
1979 self.to_screen(u'%s: Direct download detected' % title)
1981 def _real_extract(self, url):
1982 mobj = re.match(self._VALID_URL, url)
1984 raise ExtractorError(u'Invalid URL: %s' % url)
1986 # See https://github.com/rg3/youtube-dl/issues/857
# api.swf#<id> URLs are rewritten to the equivalent /play/ URL first.
1987 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1988 if api_mobj is not None:
1989 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1990 urlp = compat_urllib_parse_urlparse(url)
# /play/ URLs redirect; the real file id is recovered from the redirect
# target's fragment ('file' query key), then extraction recurses on the
# canonical http://blip.tv/a/a-<file_id> URL.
1991 if urlp.path.startswith('/play/'):
1992 request = compat_urllib_request.Request(url)
1993 response = compat_urllib_request.urlopen(request)
1994 redirecturl = response.geturl()
1995 rurlp = compat_urllib_parse_urlparse(redirecturl)
1996 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1997 url = 'http://blip.tv/a/a-' + file_id
1998 return self._real_extract(url)
# Asks blip.tv for a JSON rendition of the page (iTunes UA is required
# by the site, per the header below). `cchar` ('?' or '&') is computed
# on a line elided from this excerpt.
2005 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2006 request = compat_urllib_request.Request(json_url)
2007 request.add_header('User-Agent', 'iTunes/10.6.1')
2008 self.report_extraction(mobj.group(1))
2011 urlh = compat_urllib_request.urlopen(request)
# If the server answers with the media itself, synthesize the info dict
# from the URL basename instead of parsing JSON.
2012 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2013 basename = url.split('/')[-1]
2014 title,ext = os.path.splitext(basename)
2015 title = title.decode('UTF-8')
2016 ext = ext.replace('.', '')
2017 self.report_direct_download(title)
2022 'upload_date': None,
2027 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2028 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2029 if info is None: # Regular URL
# Read and decode the JSON body; network errors become ExtractorError.
2031 json_code_bytes = urlh.read()
2032 json_code = json_code_bytes.decode('utf-8')
2033 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2034 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2037 json_data = json.loads(json_code)
# The payload may be wrapped in a 'Post' envelope.
2038 if 'Post' in json_data:
2039 data = json_data['Post']
# datestamp like '12-31-13 11:59PM' is normalized to YYYYMMDD.
2043 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2044 video_url = data['media']['url']
2045 umobj = re.match(self._URL_EXT, video_url)
2047 raise ValueError('Can not determine filename extension')
2048 ext = umobj.group(1)
2051 'id': data['item_id'],
2053 'uploader': data['display_name'],
2054 'upload_date': upload_date,
2055 'title': data['title'],
2057 'format': data['media']['mimeType'],
2058 'thumbnail': data['thumbnailUrl'],
2059 'description': data['description'],
2060 'player_url': data['embedUrl'],
# Downstream HTTP fetches must reuse the same UA the JSON API required.
2061 'user_agent': 'iTunes/10.6.1',
2063 except (ValueError,KeyError) as err:
2064 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): excerpt is elided — initializations of `x`, `y`, `out`, the
# GK constant's surrounding assignment, `params`, `video_filepath`, and
# several branch/else lines are missing. Comments describe only the visible
# lines; verify against the full source.
2069 class MyVideoIE(InfoExtractor):
2070 """Information Extractor for myvideo.de."""
2072 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2073 IE_NAME = u'myvideo'
2075 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2076 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2077 # https://github.com/rg3/youtube-dl/pull/842
# RC4 stream cipher: key-scheduling loop below, then the PRGA loop that
# XORs each byte of `data`. Used to decrypt the player XML payload.
2078 def __rc4crypt(self,data, key):
2080 box = list(range(256))
2081 for i in list(range(256)):
2082 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2083 box[i], box[x] = box[x], box[i]
2089 y = (y + box[x]) % 256
2090 box[x], box[y] = box[y], box[x]
2091 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# md5 hex digest as bytes — helper for building the RC4 key (`sk` below).
2095 return hashlib.md5(s).hexdigest().encode()
2097 def _real_extract(self,url):
2098 mobj = re.match(self._VALID_URL, url)
2100 raise ExtractorError(u'invalid URL: %s' % url)
2102 video_id = mobj.group(1)
# Double-base64-encoded site key material (decoded at the `sk` build below).
2105 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2106 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2107 b'TnpsbA0KTVRkbU1tSTRNdz09'
2111 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2112 webpage = self._download_webpage(webpage_url, video_id)
# Fast path: a plain <source src=...> tag means a directly downloadable flv.
2114 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2115 if mobj is not None:
2116 self.report_extraction(video_id)
2117 video_url = mobj.group(1) + '.flv'
2119 video_title = self._html_search_regex('<title>([^<]+)</title>',
2122 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2128 'upload_date': None,
2129 'title': video_title,
# Otherwise parse the flashvars blob to locate the encrypted player XML.
2134 mobj = re.search('var flashvars={(.+?)}', webpage)
2136 raise ExtractorError(u'Unable to extract video')
2141 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2142 if not a == '_encxml':
2145 encxml = compat_urllib_parse.unquote(b)
2146 if not params.get('domain'):
2147 params['domain'] = 'www.myvideo.de'
2148 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV flavor of the player is avoided by re-requesting the generic one.
2149 if 'flash_playertype=MTV' in xmldata_url:
2150 self._downloader.report_warning(u'avoiding MTV player')
2152 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2153 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# Payload arrives hex-encoded after an '='; decrypt with RC4 using the
# md5-derived key `sk` (built from the decoded GK constant + video id).
2157 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2158 enc_data_b = binascii.unhexlify(enc_data)
2160 base64.b64decode(base64.b64decode(GK)) +
2162 str(video_id).encode('utf-8')
2165 dec_data = self.__rc4crypt(enc_data_b, sk)
2168 self.report_extraction(video_id)
# rtmp case: connectionurl present in the decrypted data.
2171 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2173 video_url = compat_urllib_parse.unquote(mobj.group(1))
2174 if 'myvideo2flash' in video_url:
2175 self._downloader.report_warning(u'forcing RTMPT ...')
2176 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2179 # extract non rtmp videos
2180 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2182 raise ExtractorError(u'unable to extract url')
2183 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2185 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2186 video_file = compat_urllib_parse.unquote(video_file)
# Non-f4m files get an rtmp-style play path 'ext:path'; f4m manifests are
# rewritten to their .m3u8 HLS equivalent instead.
2188 if not video_file.endswith('f4m'):
2189 ppath, prefix = video_file.split('.')
2190 video_playpath = '%s:%s' % (prefix, ppath)
2191 video_hls_playlist = ''
2194 video_hls_playlist = (
2195 video_filepath + video_file
2196 ).replace('.f4m', '.m3u8')
2198 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2199 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2201 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2207 'tc_url': video_url,
2209 'upload_date': None,
2210 'title': video_title,
2212 'play_path': video_playpath,
2213 'video_file': video_file,
2214 'video_hls_playlist': video_hls_playlist,
2215 'player_url': video_swfobj,
# NOTE(review): excerpt is elided — the bodies of _video_extensions /
# _video_dimensions, several else branches, the `turls` accumulation, the
# format-selection loop over `sd_formats`, and the final info dict opener
# are missing. Comments describe only what the visible lines show.
2219 class ComedyCentralIE(InfoExtractor):
2220 """Information extractor for The Daily Show and Colbert Report """
2222 # urls can be abbreviations like :thedailyshow or :colbert
2223 # urls for episodes like:
2224 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2225 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2226 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex; matched with re.VERBOSE everywhere (see suitable()).
2227 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2228 |(https?://)?(www\.)?
2229 (?P<showname>thedailyshow|colbertnation)\.com/
2230 (full-episodes/(?P<episode>.*)|
2232 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2233 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates in ascending preference order; turls[-1] below is the highest.
2236 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2238 _video_extensions = {
2246 _video_dimensions = {
# Overrides the base class because this _VALID_URL needs re.VERBOSE.
2256 def suitable(cls, url):
2257 """Receives a URL and returns True if suitable for this IE."""
2258 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2260 def _print_formats(self, formats):
2261 print('Available formats:')
2263 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2266 def _real_extract(self, url):
2267 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2269 raise ExtractorError(u'Invalid URL: %s' % url)
# Shortname forms (':tds', ':colbert', ...) expand to the show's
# full-episodes page, then the URL is re-matched.
2271 if mobj.group('shortname'):
2272 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2273 url = u'http://www.thedailyshow.com/full-episodes/'
2275 url = u'http://www.colbertnation.com/full-episodes/'
2276 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2277 assert mobj is not None
# Clip URLs carry their title in different groups per show.
2279 if mobj.group('clip'):
2280 if mobj.group('showname') == 'thedailyshow':
2281 epTitle = mobj.group('tdstitle')
2283 epTitle = mobj.group('cntitle')
2286 dlNewest = not mobj.group('episode')
2288 epTitle = mobj.group('showname')
2290 epTitle = mobj.group('episode')
2292 self.report_extraction(epTitle)
2293 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# A full-episodes page redirects to the newest episode; follow the
# redirect target and insist it names a concrete episode.
2295 url = htmlHandle.geturl()
2296 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2298 raise ExtractorError(u'Invalid redirected URL: ' + url)
2299 if mobj.group('episode') == '':
2300 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2301 epTitle = mobj.group('episode')
# Locate the mtvnservices media URIs embedded in the page.
2303 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2305 if len(mMovieParams) == 0:
2306 # The Colbert Report embeds the information in a without
2307 # a URL prefix; so extract the alternate reference
2308 # and then add the URL prefix manually.
2310 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2311 if len(altMovieParams) == 0:
2312 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2314 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Fetch the MRSS index of the episode's parts.
2316 uri = mMovieParams[0][1]
2317 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2318 indexXml = self._download_webpage(indexUrl, epTitle,
2319 u'Downloading show index',
2320 u'unable to download episode index')
2324 idoc = xml.etree.ElementTree.fromstring(indexXml)
2325 itemEls = idoc.findall('.//item')
# One iteration per episode part; each yields its own info dict.
2326 for partNum,itemEl in enumerate(itemEls):
2327 mediaId = itemEl.findall('./guid')[0].text
2328 shortMediaId = mediaId.split(':')[-1]
2329 showId = mediaId.split(':')[-2].replace('.com', '')
2330 officialTitle = itemEl.findall('./title')[0].text
2331 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
# Per-part config XML lists the available renditions (bitrate, rtmp src).
2333 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2334 compat_urllib_parse.urlencode({'uri': mediaId}))
2335 configXml = self._download_webpage(configUrl, epTitle,
2336 u'Downloading configuration for %s' % shortMediaId)
2338 cdoc = xml.etree.ElementTree.fromstring(configXml)
2340 for rendition in cdoc.findall('.//rendition'):
2341 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2345 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2348 if self._downloader.params.get('listformats', None):
2349 self._print_formats([i[0] for i in turls])
2352 # For now, just pick the highest bitrate
2353 format,rtmp_video_url = turls[-1]
2355 # Get the format arg from the arg stream
2356 req_format = self._downloader.params.get('format', None)
2358 # Select format if we can find one
2361 format, rtmp_video_url = f, v
# The rtmp URL is rewritten to a plain HTTP mp4 URL on the llnwd CDN.
2364 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2366 raise ExtractorError(u'Cannot transform RTMP url')
2367 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2368 video_url = base + m.group('finalid')
2370 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2375 'upload_date': officialDate,
2380 'description': officialTitle,
2382 results.append(info)
# NOTE(review): excerpt is elided — the `if mobj is None:` guard line and
# the final info-dict opener/'id'/'url' entries are not visible here.
2387 class EscapistIE(InfoExtractor):
2388 """Information extractor for The Escapist """
2390 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2391 IE_NAME = u'escapist'
2393 def _real_extract(self, url):
2394 mobj = re.match(self._VALID_URL, url)
2396 raise ExtractorError(u'Invalid URL: %s' % url)
2397 showName = mobj.group('showname')
2398 videoId = mobj.group('episode')
2400 self.report_extraction(videoId)
2401 webpage = self._download_webpage(url, videoId)
# Metadata comes from <meta> tags; description/thumbnail are optional.
2403 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2404 webpage, u'description', fatal=False)
2406 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2407 webpage, u'thumbnail', fatal=False)
2409 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2410 webpage, u'player url')
# Page title is 'Show : Episode'; keep only the episode part.
# (The error label u'player url' here looks copy-pasted from above.)
2412 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2413 webpage, u'player url').split(' : ')[-1]
# The player URL carries a 'config=' query pointing at a JSON-ish config.
2415 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2416 configUrl = compat_urllib_parse.unquote(configUrl)
2418 configJSON = self._download_webpage(configUrl, videoId,
2419 u'Downloading configuration',
2420 u'unable to download configuration')
2422 # Technically, it's JavaScript, not JSON
# Single quotes are swapped for double quotes so json.loads accepts it.
2423 configJSON = configJSON.replace("'", '"')
2426 config = json.loads(configJSON)
2427 except (ValueError,) as err:
2428 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
# The media URL lives in the second playlist entry.
2430 playlist = config['playlist']
2431 videoUrl = playlist[1]['url']
2436 'uploader': showName,
2437 'upload_date': None,
2440 'thumbnail': imgUrl,
2441 'description': videoDesc,
2442 'player_url': playerUrl,
# NOTE(review): excerpt is elided — the `info = {...}` opener, try lines,
# IndexError handler for the metadata XML, and the return of the final info
# dict are not visible here.
2448 class CollegeHumorIE(InfoExtractor):
2449 """Information extractor for collegehumor.com"""
2451 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2452 IE_NAME = u'collegehumor'
2454 def report_manifest(self, video_id):
2455 """Report information extraction."""
2456 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2458 def _real_extract(self, url):
2459 mobj = re.match(self._VALID_URL, url)
2461 raise ExtractorError(u'Invalid URL: %s' % url)
2462 video_id = mobj.group('videoid')
2467 'upload_date': None,
2470 self.report_extraction(video_id)
# Step 1: per-video metadata XML (title/description/thumbnail + f4m URL).
2471 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2473 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2475 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2477 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2479 videoNode = mdoc.findall('./video')[0]
2480 info['description'] = videoNode.findall('./description')[0].text
2481 info['title'] = videoNode.findall('./caption')[0].text
2482 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2483 manifest_url = videoNode.findall('./file')[0].text
2485 raise ExtractorError(u'Invalid metadata XML file')
# Step 2: the Adobe HDS (f4m) manifest; hdcore param required by the CDN.
2487 manifest_url += '?hdcore=2.10.3'
2488 self.report_manifest(video_id)
2490 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2491 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2492 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2494 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m namespace-qualified lookups; media/@url + id build the fragment URL.
2496 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2497 node_id = media_node.attrib['url']
2498 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2499 except IndexError as err:
2500 raise ExtractorError(u'Invalid manifest file')
# Final URL: scheme://host/z<id minus 2 chars>/<node_id>Seg1-Frag1.
2502 url_pr = compat_urllib_parse_urlparse(manifest_url)
2503 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): excerpt is elided — the `if mobj is None:` guard and the
# info-dict opener/'id'/'url' entries are not visible here.
2511 class XVideosIE(InfoExtractor):
2512 """Information extractor for xvideos.com"""
2513 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2514 IE_NAME = u'xvideos'
2516 def _real_extract(self, url):
2517 mobj = re.match(self._VALID_URL, url)
2519 raise ExtractorError(u'Invalid URL: %s' % url)
2520 video_id = mobj.group(1)
2522 webpage = self._download_webpage(url, video_id)
2524 self.report_extraction(video_id)
# The flv URL is URL-encoded inside a flv_url= flashvar.
2527 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2528 webpage, u'video URL'))
# Page <title> has a ' - XVID...' suffix stripped by the regex.
2531 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2534 # Extract video thumbnail
2535 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2536 webpage, u'thumbnail', fatal=False)
2542 'upload_date': None,
2543 'title': video_title,
2545 'thumbnail': video_thumbnail,
2546 'description': None,
# NOTE(review): excerpt is elided — the `if mobj is None:` guard and the
# return/info-dict opener around the final entries are not visible here.
2552 class SoundcloudIE(InfoExtractor):
2553 """Information extractor for soundcloud.com
2554 To access the media, the uid of the song and a stream token
2555 must be extracted from the page source and the script must make
2556 a request to media.soundcloud.com/crossdomain.xml. Then
2557 the media can be grabbed by requesting from an url composed
2558 of the stream token and uid
# URL shape: soundcloud.com/<uploader>/<track-slug>.
2561 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2562 IE_NAME = u'soundcloud'
2564 def report_resolve(self, video_id):
2565 """Report information extraction."""
2566 self.to_screen(u'%s: Resolving id' % video_id)
2568 def _real_extract(self, url):
2569 mobj = re.match(self._VALID_URL, url)
2571 raise ExtractorError(u'Invalid URL: %s' % url)
2573 # extract uploader (which is in the url)
2574 uploader = mobj.group(1)
2575 # extract simple title (uploader + slug of song title)
2576 slug_title = mobj.group(2)
2577 simple_title = uploader + u'-' + slug_title
2578 full_title = '%s/%s' % (uploader, slug_title)
2580 self.report_resolve(full_title)
# Resolve the human-readable URL to the track's numeric API id.
2582 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2583 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2584 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2586 info = json.loads(info_json)
2587 video_id = info['id']
2588 self.report_extraction(full_title)
# The streams endpoint exposes the 128kbps MP3 stream URL.
2590 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2591 stream_json = self._download_webpage(streams_url, full_title,
2592 u'Downloading stream definitions',
2593 u'unable to download stream definitions')
2595 streams = json.loads(stream_json)
2596 mediaURL = streams['http_mp3_128_url']
2597 upload_date = unified_strdate(info['created_at'])
2602 'uploader': info['user']['username'],
2603 'upload_date': upload_date,
2604 'title': info['title'],
2606 'description': info['description'],
# NOTE(review): excerpt is elided — the `if mobj is None:` guard, the
# results-list setup, and the per-track info-dict opener are not visible.
# Structure parallels SoundcloudIE above but iterates a set's tracks.
2609 class SoundcloudSetIE(InfoExtractor):
2610 """Information extractor for soundcloud.com sets
2611 To access the media, the uid of the song and a stream token
2612 must be extracted from the page source and the script must make
2613 a request to media.soundcloud.com/crossdomain.xml. Then
2614 the media can be grabbed by requesting from an url composed
2615 of the stream token and uid
# URL shape: soundcloud.com/<uploader>/sets/<set-slug>.
2618 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2619 IE_NAME = u'soundcloud:set'
2621 def report_resolve(self, video_id):
2622 """Report information extraction."""
2623 self.to_screen(u'%s: Resolving id' % video_id)
2625 def _real_extract(self, url):
2626 mobj = re.match(self._VALID_URL, url)
2628 raise ExtractorError(u'Invalid URL: %s' % url)
2630 # extract uploader (which is in the url)
2631 uploader = mobj.group(1)
2632 # extract simple title (uploader + slug of song title)
2633 slug_title = mobj.group(2)
2634 simple_title = uploader + u'-' + slug_title
2635 full_title = '%s/sets/%s' % (uploader, slug_title)
2637 self.report_resolve(full_title)
# Resolve the set URL to its API representation (includes 'tracks').
2639 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2640 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2641 info_json = self._download_webpage(resolv_url, full_title)
2644 info = json.loads(info_json)
# API-level errors are reported per entry rather than raised.
2645 if 'errors' in info:
2646 for err in info['errors']:
2647 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2650 self.report_extraction(full_title)
# One streams request per track in the set.
2651 for track in info['tracks']:
2652 video_id = track['id']
2654 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2655 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2657 self.report_extraction(video_id)
2658 streams = json.loads(stream_json)
2659 mediaURL = streams['http_mp3_128_url']
2664 'uploader': track['user']['username'],
2665 'upload_date': unified_strdate(track['created_at']),
2666 'title': track['title'],
2668 'description': track['description'],
# NOTE(review): excerpt is elided — the `if mobj is None:` guards and the
# info-dict opener/'id'/'url' entries are not visible here.
2673 class InfoQIE(InfoExtractor):
2674 """Information extractor for infoq.com"""
2675 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2677 def _real_extract(self, url):
2678 mobj = re.match(self._VALID_URL, url)
2680 raise ExtractorError(u'Invalid URL: %s' % url)
# The full URL doubles as the display id for progress messages.
2682 webpage = self._download_webpage(url, video_id=url)
2683 self.report_extraction(url)
# The real media id is base64-encoded in a JS variable.
2686 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2688 raise ExtractorError(u'Unable to extract video url')
2689 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2690 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2693 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2696 # Extract description
2697 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2698 webpage, u'description', fatal=False)
# id/extension are derived from the media filename in the rtmp URL.
2700 video_filename = video_url.split('/')[-1]
2701 video_id, extension = video_filename.split('.')
2707 'upload_date': None,
2708 'title': video_title,
2709 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2711 'description': video_description,
# NOTE(review): excerpt is elided — try/else lines, the return statements of
# get_urls/check_urls, and the final return around the info dict are not
# visible. Marked _WORKING = False, so this extractor is disabled anyway.
2716 class MixcloudIE(InfoExtractor):
2717 """Information extractor for www.mixcloud.com"""
2719 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2720 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2721 IE_NAME = u'mixcloud'
2723 def report_download_json(self, file_id):
2724 """Report JSON download."""
2725 self.to_screen(u'Downloading json')
2727 def get_urls(self, jsonData, fmt, bitrate='best'):
2728 """Get urls from 'audio_formats' section in json"""
# 'best' (or an unknown bitrate) falls back to the highest available one.
2731 bitrate_list = jsonData[fmt]
2732 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2733 bitrate = max(bitrate_list) # select highest
2735 url_list = jsonData[fmt][bitrate]
# Some formats have no per-bitrate nesting; use the format entry directly.
2736 except TypeError: # we have no bitrate info.
2737 url_list = jsonData[fmt]
2740 def check_urls(self, url_list):
2741 """Returns 1st active url from list"""
# Probe each candidate URL; network errors mean "try the next one".
2742 for url in url_list:
2744 compat_urllib_request.urlopen(url)
2746 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2751 def _print_formats(self, formats):
2752 print('Available formats:')
2753 for fmt in formats.keys():
2754 for b in formats[fmt]:
2756 ext = formats[fmt][b][0]
2757 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2758 except TypeError: # we have no bitrate info
2759 ext = formats[fmt][0]
2760 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2763 def _real_extract(self, url):
2764 mobj = re.match(self._VALID_URL, url)
2766 raise ExtractorError(u'Invalid URL: %s' % url)
2767 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on a str is Python-2-only; this block
# predates the py3 port (consistent with _WORKING = False).
2768 uploader = mobj.group(1).decode('utf-8')
2769 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2771 # construct API request
2772 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2773 # retrieve .json file with links to files
2774 request = compat_urllib_request.Request(file_url)
2776 self.report_download_json(file_url)
2777 jsonData = compat_urllib_request.urlopen(request).read()
2778 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2779 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2782 json_data = json.loads(jsonData)
2783 player_url = json_data['player_swf_url']
2784 formats = dict(json_data['audio_formats'])
2786 req_format = self._downloader.params.get('format', None)
2789 if self._downloader.params.get('listformats', None):
2790 self._print_formats(formats)
# Default/best: take the first format whose URL list yields a live URL.
2793 if req_format is None or req_format == 'best':
2794 for format_param in formats.keys():
2795 url_list = self.get_urls(formats, format_param)
2797 file_url = self.check_urls(url_list)
2798 if file_url is not None:
# Explicit format request: must exist in the API response.
2801 if req_format not in formats:
2802 raise ExtractorError(u'Format is not available')
2804 url_list = self.get_urls(formats, req_format)
2805 file_url = self.check_urls(url_list)
2806 format_param = req_format
2809 'id': file_id.decode('utf-8'),
2810 'url': file_url.decode('utf-8'),
2811 'uploader': uploader.decode('utf-8'),
2812 'upload_date': None,
2813 'title': json_data['name'],
2814 'ext': file_url.split('.')[-1].decode('utf-8'),
2815 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2816 'thumbnail': json_data['thumbnail_url'],
2817 'description': json_data['description'],
2818 'player_url': player_url.decode('utf-8'),
# NOTE(review): excerpt is elided — several info-dict openers, try lines,
# list-comprehension bodies building info['list'], and return statements are
# not visible. Three cases: single video, course page, root page.
2822 class StanfordOpenClassroomIE(InfoExtractor):
2823 """Information extractor for Stanford's Open ClassRoom"""
2824 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2825 IE_NAME = u'stanfordoc'
2827 def _real_extract(self, url):
2828 mobj = re.match(self._VALID_URL, url)
2830 raise ExtractorError(u'Invalid URL: %s' % url)
# Case 1: course + video → fetch the per-video metadata XML directly.
2832 if mobj.group('course') and mobj.group('video'): # A specific video
2833 course = mobj.group('course')
2834 video = mobj.group('video')
2836 'id': course + '_' + video,
2838 'upload_date': None,
2841 self.report_extraction(info['id'])
2842 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2843 xmlUrl = baseUrl + video + '.xml'
2845 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2846 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2847 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2848 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2850 info['title'] = mdoc.findall('./title')[0].text
2851 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2853 raise ExtractorError(u'Invalid metadata XML file')
2854 info['ext'] = info['url'].rpartition('.')[2]
# Case 2: a course page → collect its VideoPage links as references and
# recursively extract each one.
2856 elif mobj.group('course'): # A course page
2857 course = mobj.group('course')
2862 'upload_date': None,
2865 coursepage = self._download_webpage(url, info['id'],
2866 note='Downloading course info page',
2867 errnote='Unable to download course info page')
2869 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2871 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2872 coursepage, u'description', fatal=False)
2874 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2877 'type': 'reference',
2878 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2882 for entry in info['list']:
2883 assert entry['type'] == 'reference'
2884 results += self.extract(entry['url'])
# Case 3: the root page → collect CoursePage links and recurse per course.
2888 'id': 'Stanford OpenClassroom',
2891 'upload_date': None,
2894 self.report_download_webpage(info['id'])
2895 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2897 rootpage = compat_urllib_request.urlopen(rootURL).read()
2898 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2899 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2901 info['title'] = info['id']
2903 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2906 'type': 'reference',
2907 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2912 for entry in info['list']:
2913 assert entry['type'] == 'reference'
2914 results += self.extract(entry['url'])
# NOTE(review): excerpt is elided — the guards checking song_name/content_id,
# the assignment of `performer`, try lines, and the info-dict opener are not
# visible here.
2917 class MTVIE(InfoExtractor):
2918 """Information extractor for MTV.com"""
2920 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2923 def _real_extract(self, url):
2924 mobj = re.match(self._VALID_URL, url)
2926 raise ExtractorError(u'Invalid URL: %s' % url)
# Scheme is optional in _VALID_URL; default to http for the fetch.
2927 if not mobj.group('proto'):
2928 url = 'http://' + url
2929 video_id = mobj.group('videoid')
2931 webpage = self._download_webpage(url, video_id)
# Metadata lives in mtv_* / mtvn_* <meta> tags.
2933 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2934 webpage, u'song name', fatal=False)
2936 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2939 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2940 webpage, u'mtvn_uri', fatal=False)
2942 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2943 webpage, u'content id', fatal=False)
# mediaGen XML enumerates the available renditions for this video.
2945 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2946 self.report_extraction(video_id)
2947 request = compat_urllib_request.Request(videogen_url)
2949 metadataXml = compat_urllib_request.urlopen(request).read()
2950 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2951 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2953 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2954 renditions = mdoc.findall('.//rendition')
2956 # For now, always pick the highest quality.
2957 rendition = renditions[-1]
# Format string: '<ext>-<width>x<height>_<bitrate>' from rendition attrs.
2960 _,_,ext = rendition.attrib['type'].partition('/')
2961 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2962 video_url = rendition.find('./src').text
2964 raise ExtractorError('Invalid rendition field.')
2969 'uploader': performer,
2970 'upload_date': None,
2971 'title': video_title,
# NOTE(review): excerpt is elided — the _gen_sid def line, realId/mixed
# initializations, the format→fileid mapping branches ('hd2', flv/mp4
# selection), ext/format fields of the per-part info dict, and the final
# return are not visible here.
2979 class YoukuIE(InfoExtractor):
2980 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp plus two bounded random ints.
2983 nowTime = int(time.time() * 1000)
2984 random1 = random.randint(1000,1998)
2985 random2 = random.randint(1000,9999)
2987 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic shuffle of the alphabet driven by the server 'seed';
# must match Youku's player algorithm exactly.
2989 def _get_file_ID_mix_string(self, seed):
2991 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2993 for i in range(len(source)):
2994 seed = (seed * 211 + 30031 ) % 65536
2995 index = math.floor(seed / 65536 * len(source) )
2996 mixed.append(source[int(index)])
2997 source.remove(source[int(index)])
2998 #return ''.join(mixed)
# Decode the '*'-separated fileId indices through the mixed alphabet.
3001 def _get_file_id(self, fileId, seed):
3002 mixed = self._get_file_ID_mix_string(seed)
3003 ids = fileId.split('*')
3007 realId.append(mixed[int(ch)])
3008 return ''.join(realId)
3010 def _real_extract(self, url):
3011 mobj = re.match(self._VALID_URL, url)
3013 raise ExtractorError(u'Invalid URL: %s' % url)
3014 video_id = mobj.group('ID')
3016 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3018 jsondata = self._download_webpage(info_url, video_id)
3020 self.report_extraction(video_id)
3022 config = json.loads(jsondata)
3024 video_title = config['data'][0]['title']
3025 seed = config['data'][0]['seed']
3027 format = self._downloader.params.get('format', None)
3028 supported_format = list(config['data'][0]['streamfileids'].keys())
3030 if format is None or format == 'best':
3031 if 'hd2' in supported_format:
3036 elif format == 'worst':
# Per-segment decryption keys come alongside the stream file ids.
3044 fileid = config['data'][0]['streamfileids'][format]
3045 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3046 except (UnicodeDecodeError, ValueError, KeyError):
3047 raise ExtractorError(u'Unable to extract info section')
3050 sid = self._gen_sid()
3051 fileid = self._get_file_id(fileid, seed)
3053 #column 8,9 of fileid represent the segment number
3054 #fileid[7:9] should be changed
# Build one download URL (and info dict) per segment, patching the
# segment index into positions 8-9 of the fileid as hex.
3055 for index, key in enumerate(keys):
3057 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3058 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3061 'id': '%s_part%02d' % (video_id, index),
3062 'url': download_url,
3064 'upload_date': None,
3065 'title': video_title,
3068 files_info.append(info)
3073 class XNXXIE(InfoExtractor):
3074 """Information extractor for xnxx.com"""
3076 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping regexes: video URL comes percent-encoded in a query
# parameter; title and thumbnail are read straight from the HTML.
3078 VIDEO_URL_RE = r'flv_url=(.*?)&'
3079 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3080 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3082 def _real_extract(self, url):
3083 mobj = re.match(self._VALID_URL, url)
3085 raise ExtractorError(u'Invalid URL: %s' % url)
3086 video_id = mobj.group(1)
3088 # Get webpage content
3089 webpage = self._download_webpage(url, video_id)
# The flv URL is percent-encoded in the page, hence the unquote.
3091 video_url = self._search_regex(self.VIDEO_URL_RE,
3092 webpage, u'video URL')
3093 video_url = compat_urllib_parse.unquote(video_url)
3095 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
3098 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3099 webpage, u'thumbnail', fatal=False)
3105 'upload_date': None,
3106 'title': video_title,
3108 'thumbnail': video_thumbnail,
3109 'description': None,
3113 class GooglePlusIE(InfoExtractor):
3114 """Information extractor for plus.google.com."""
3116 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3117 IE_NAME = u'plus.google'
3119 def _real_extract(self, url):
3120 # Extract id from URL
3121 mobj = re.match(self._VALID_URL, url)
3123 raise ExtractorError(u'Invalid URL: %s' % url)
3125 post_url = mobj.group(0)
3126 video_id = mobj.group(1)
3128 video_extension = 'flv'
3130 # Step 1, Retrieve post webpage to extract further information
3131 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3133 self.report_extraction(video_id)
3135 # Extract update date
3136 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3137 webpage, u'upload date', fatal=False)
3139 # Convert timestring to a format suitable for filename
# NOTE(review): strptime will raise if upload_date is None (fatal=False
# above can return None) or in an unexpected format — confirm upstream.
3140 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3141 upload_date = upload_date.strftime('%Y%m%d')
3144 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3145 webpage, u'uploader', fatal=False)
3148 # Get the first line for title
3149 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3150 webpage, 'title', default=u'NA')
3152 # Step 2, Stimulate clicking the image box to launch video
3153 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3154 webpage, u'video page URL')
3155 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3157 # Extract video links on video page
3158 """Extract video links of all sizes"""
3159 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3160 mobj = re.findall(pattern, webpage)
3162 raise ExtractorError(u'Unable to extract video links')
3164 # Sort in resolution
3165 links = sorted(mobj)
3167 # Choose the lowest of the sort, i.e. highest resolution
3168 video_url = links[-1]
3169 # Only get the url. The resolution part in the tuple has no use anymore
3170 video_url = video_url[-1]
3171 # Treat escaped \u0026 style hex
# Python 2 str has .decode; on Python 3 the AttributeError path
# round-trips through bytes to apply unicode-escape decoding.
3173 video_url = video_url.decode("unicode_escape")
3174 except AttributeError: # Python 3
3175 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3181 'uploader': uploader,
3182 'upload_date': upload_date,
3183 'title': video_title,
3184 'ext': video_extension,
3187 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages. The download URL is not
# scraped: it is constructed from the URL path id against Turner's CDN.
3188 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3191 def _real_extract(self, url):
3192 mobj = re.match(self._VALID_URL, url)
3194 raise ExtractorError(u'Invalid URL: %s' % url)
3196 video_id = mobj.group(1)
3198 webpage = self._download_webpage(url, video_id)
# Fixed 1280x720 mp4 rendition on the CDN, keyed by the page path.
3200 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3202 shortened_video_id = video_id.rpartition('/')[2]
3203 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3204 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3206 # It isn't there in the HTML it returns to us
3207 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3209 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3212 'id': shortened_video_id,
3216 # 'uploader_date': uploader_date,
3217 'description': description,
3221 class JustinTVIE(InfoExtractor):
3222 """Information extractor for justin.tv and twitch.tv"""
3223 # TODO: One broadcast may be split into multiple videos. The key
3224 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3225 # starts at 1 and increases. Can we treat all parts as one video?
3227 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3229 (?P<channelid>[^/]+)|
3230 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3231 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# Page size used when paging through a channel's archive via the API.
3235 _JUSTIN_PAGE_LIMIT = 100
3236 IE_NAME = u'justin.tv'
3238 def report_download_page(self, channel, offset):
3239 """Report attempt to download a single page of videos."""
3240 self.to_screen(u'%s: Downloading video information from %d to %d' %
3241 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3243 # Return count of items, list of *valid* items
3244 def _parse_page(self, url, video_id):
# Download one API page (JSON). A non-list response is an API error
# object whose 'error' field is surfaced to the user.
3245 webpage = self._download_webpage(url, video_id,
3246 u'Downloading video info JSON',
3247 u'unable to download video info JSON')
3249 response = json.loads(webpage)
3250 if type(response) != list:
3251 error_text = response.get('error', 'unknown error')
3252 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3254 for clip in response:
3255 video_url = clip['video_file_url']
3257 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; strip dashes from YYYY-MM-DD -> YYYYMMDD.
3258 video_date = re.sub('-', '', clip['start_time'][:10])
3259 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3260 video_id = clip['id']
3261 video_title = clip.get('title', video_id)
3265 'title': video_title,
3266 'uploader': clip.get('channel_name', video_uploader_id),
3267 'uploader_id': video_uploader_id,
3268 'upload_date': video_date,
3269 'ext': video_extension,
3271 return (len(response), info)
3273 def _real_extract(self, url):
3274 mobj = re.match(self._VALID_URL, url)
3276 raise ExtractorError(u'invalid URL: %s' % url)
3278 api_base = 'http://api.justin.tv'
# Three URL shapes: whole channel archive, single broadcast (/b/),
# or a chapter of a broadcast (/c/).
3280 if mobj.group('channelid'):
3282 video_id = mobj.group('channelid')
3283 api = api_base + '/channel/archives/%s.json' % video_id
3284 elif mobj.group('chapterid'):
3285 chapter_id = mobj.group('chapterid')
3287 webpage = self._download_webpage(url, chapter_id)
3288 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3290 raise ExtractorError(u'Cannot find archive of a chapter')
3291 archive_id = m.group(1)
3293 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3294 chapter_info_xml = self._download_webpage(api, chapter_id,
3295 note=u'Downloading chapter information',
3296 errnote=u'Chapter information download failed')
3297 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3298 for a in doc.findall('.//archive'):
3299 if archive_id == a.find('./id').text:
3302 raise ExtractorError(u'Could not find chapter in chapter information')
3304 video_url = a.find('./video_file_url').text
3305 video_ext = video_url.rpartition('.')[2] or u'flv'
# Chapter title/description/channel come from the newer Twitch
# "kraken" API; the file URL itself came from the XML above.
3307 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3308 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3309 note='Downloading chapter metadata',
3310 errnote='Download of chapter metadata failed')
3311 chapter_info = json.loads(chapter_info_json)
3313 bracket_start = int(doc.find('.//bracket_start').text)
3314 bracket_end = int(doc.find('.//bracket_end').text)
3316 # TODO determine start (and probably fix up file)
3317 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3318 #video_url += u'?start=' + TODO:start_timestamp
3319 # bracket_start is 13290, but we want 51670615
3320 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3321 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3324 'id': u'c' + chapter_id,
3327 'title': chapter_info['title'],
3328 'thumbnail': chapter_info['preview'],
3329 'description': chapter_info['description'],
3330 'uploader': chapter_info['channel']['display_name'],
3331 'uploader_id': chapter_info['channel']['name'],
3335 video_id = mobj.group('videoid')
3336 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3338 self.report_extraction(video_id)
# Page through the archive; a short page (count != limit) or a
# non-paged request terminates the loop.
3342 limit = self._JUSTIN_PAGE_LIMIT
3345 self.report_download_page(video_id, offset)
3346 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3347 page_count, page_info = self._parse_page(page_url, video_id)
3348 info.extend(page_info)
3349 if not paged or page_count != limit:
3354 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com; pulls the <source> URL and
# og: metadata out of the video page HTML.
3355 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3357 def _real_extract(self, url):
3358 mobj = re.match(self._VALID_URL, url)
3360 raise ExtractorError(u'invalid URL: %s' % url)
3362 video_id = mobj.group('id')
3363 webpage = self._download_webpage(url, video_id)
3365 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3366 webpage, u'video URL', flags=re.DOTALL)
# Two title patterns are tried in order: player heading, then <title>.
3368 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3369 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3371 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3372 webpage, u'description', fatal=False, flags=re.DOTALL)
3379 'description': video_description,
3383 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com video/app pages.
# Returns a playlist of all movies found on the page.
3384 _VALID_URL = r"""http://store\.steampowered\.com/
3386 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3388 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3390 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
# Pre-filled birth date query string to pass Steam's age gate.
3391 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
3394 def suitable(cls, url):
3395 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written with (?x)-style comments
# and must be matched with re.VERBOSE.
3396 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3398 def _real_extract(self, url):
3399 m = re.match(self._VALID_URL, url, re.VERBOSE)
3400 gameID = m.group('gameID')
3402 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3403 webpage = self._download_webpage(videourl, gameID)
3405 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3406 videourl = self._AGECHECK_TEMPLATE % gameID
3407 self.report_age_confirmation()
3408 webpage = self._download_webpage(videourl, gameID)
3410 self.report_extraction(gameID)
3411 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3412 webpage, 'game title')
3414 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3415 mweb = re.finditer(urlRE, webpage)
3416 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3417 titles = re.finditer(namesRE, webpage)
3418 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3419 thumbs = re.finditer(thumbsRE, webpage)
# The three finditer streams are assumed to line up one-to-one per
# movie; zip pairs url/title/thumbnail positionally.
3421 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3422 video_id = vid.group('videoID')
3423 title = vtitle.group('videoName')
3424 video_url = vid.group('videoURL')
3425 video_thumb = thumb.group('thumbnail')
3427 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3432 'title': unescapeHTML(title),
3433 'thumbnail': video_thumb
3436 return [self.playlist_result(videos, gameID, game_title)]
3438 class UstreamIE(InfoExtractor):
# Information extractor for recorded ustream.tv videos; the flv URL is
# derived directly from the numeric video id on their CDN.
3439 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3440 IE_NAME = u'ustream'
3442 def _real_extract(self, url):
3443 m = re.match(self._VALID_URL, url)
3444 video_id = m.group('videoID')
3446 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3447 webpage = self._download_webpage(url, video_id)
3449 self.report_extraction(video_id)
3451 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3454 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3455 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3457 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3458 webpage, u'thumbnail', fatal=False)
3464 'title': video_title,
3465 'uploader': uploader,
3466 'thumbnail': thumbnail,
3470 class WorldStarHipHopIE(InfoExtractor):
# Information extractor for worldstarhiphop.com / worldstarcandy.com.
3471 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3472 IE_NAME = u'WorldStarHipHop'
3474 def _real_extract(self, url):
3475 m = re.match(self._VALID_URL, url)
3476 video_id = m.group('id')
3478 webpage_src = self._download_webpage(url, video_id)
# The player sets the file via so.addVariable("file", ...).
3480 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3481 webpage_src, u'video URL')
3483 if 'mp4' in video_url:
3488 video_title = self._html_search_regex(r"<title>(.*)</title>",
3489 webpage_src, u'title')
3491 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3492 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3493 webpage_src, u'thumbnail', fatal=False)
# Candy pages carry the real title in a candytitles span instead.
3496 _title = r"""candytitles.*>(.*)</span>"""
3497 mobj = re.search(_title, webpage_src)
3498 if mobj is not None:
3499 video_title = mobj.group(1)
3504 'title' : video_title,
3505 'thumbnail' : thumbnail,
3510 class RBMARadioIE(InfoExtractor):
# Information extractor for rbmaradio.com shows; all metadata comes from
# an inline JSON blob (window.gon.show) on the show page.
3511 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3513 def _real_extract(self, url):
3514 m = re.match(self._VALID_URL, url)
3515 video_id = m.group('videoID')
3517 webpage = self._download_webpage(url, video_id)
3519 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
3520 webpage, u'json data', flags=re.MULTILINE)
3523 data = json.loads(json_data)
3524 except ValueError as e:
3525 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Request the 256 kbit/s rendition from the Akamai URL.
3527 video_url = data['akamai_url'] + '&cbr=256'
3528 url_parts = compat_urllib_parse_urlparse(video_url)
3529 video_ext = url_parts.path.rpartition('.')[2]
3534 'title': data['title'],
3535 'description': data.get('teaser_text'),
3536 'location': data.get('country_of_origin'),
3537 'uploader': data.get('host', {}).get('name'),
3538 'uploader_id': data.get('host', {}).get('slug'),
3539 'thumbnail': data.get('image', {}).get('large_url_2x'),
3540 'duration': data.get('duration'),
3545 class YouPornIE(InfoExtractor):
3546 """Information extractor for youporn.com."""
3547 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3549 def _print_formats(self, formats):
3550 """Print all available formats"""
3551 print(u'Available formats:')
3552 print(u'ext\t\tformat')
3553 print(u'---------------------------------')
3554 for format in formats:
3555 print(u'%s\t\t%s' % (format['ext'], format['format']))
3557 def _specific(self, req_format, formats):
# Return the single format entry whose 'format' string matches the
# user-requested format (selection helper for --format).
3559 if(x["format"]==req_format):
3563 def _real_extract(self, url):
3564 mobj = re.match(self._VALID_URL, url)
3566 raise ExtractorError(u'Invalid URL: %s' % url)
3567 video_id = mobj.group('videoid')
# The site requires an age_verified cookie before serving the page.
3569 req = compat_urllib_request.Request(url)
3570 req.add_header('Cookie', 'age_verified=1')
3571 webpage = self._download_webpage(req, video_id)
3573 # Get JSON parameters
3574 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3576 params = json.loads(json_params)
3578 raise ExtractorError(u'Invalid JSON')
3580 self.report_extraction(video_id)
3582 video_title = params['title']
3583 upload_date = unified_strdate(params['release_date_f'])
3584 video_description = params['description']
3585 video_uploader = params['submitted_by']
3586 thumbnail = params['thumbnails'][0]['image']
3588 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3590 # Get all of the formats available
3591 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3592 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3593 webpage, u'download list').strip()
3595 # Get all of the links from the page
3596 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3597 links = re.findall(LINK_RE, download_list_html)
3598 if(len(links) == 0):
3599 raise ExtractorError(u'ERROR: no known formats available for video')
3601 self.to_screen(u'Links found: %d' % len(links))
3606 # A link looks like this:
3607 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3608 # A path looks like this:
3609 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3610 video_url = unescapeHTML( link )
3611 path = compat_urllib_parse_urlparse( video_url ).path
3612 extension = os.path.splitext( path )[1][1:]
# Format label is the "<resolution>_<bitrate>" prefix of the fourth
# path component, e.g. "480p_370k" -> "480p-370k".
3613 format = path.split('/')[4].split('_')[:2]
3616 format = "-".join( format )
3617 # title = u'%s-%s-%s' % (video_title, size, bitrate)
3622 'uploader': video_uploader,
3623 'upload_date': upload_date,
3624 'title': video_title,
3627 'thumbnail': thumbnail,
3628 'description': video_description
3631 if self._downloader.params.get('listformats', None):
3632 self._print_formats(formats)
3635 req_format = self._downloader.params.get('format', None)
3636 self.to_screen(u'Format: %s' % req_format)
# 'best' = first entry, 'worst' = last, '-1'/'all' = every format,
# otherwise look up the exact requested format string.
3638 if req_format is None or req_format == 'best':
3640 elif req_format == 'worst':
3641 return [formats[-1]]
3642 elif req_format in ('-1', 'all'):
3645 format = self._specific( req_format, formats )
3647 raise ExtractorError(u'Requested format not available')
3652 class PornotubeIE(InfoExtractor):
3653 """Information extractor for pornotube.com."""
3654 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3656 def _real_extract(self, url):
3657 mobj = re.match(self._VALID_URL, url)
3659 raise ExtractorError(u'Invalid URL: %s' % url)
# Title is taken from the URL slug rather than the page.
3661 video_id = mobj.group('videoid')
3662 video_title = mobj.group('title')
3664 # Get webpage content
3665 webpage = self._download_webpage(url, video_id)
3668 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3669 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3670 video_url = compat_urllib_parse.unquote(video_url)
3672 #Get the uploaded date
3673 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3674 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
# Normalise the scraped date to YYYYMMDD only when one was found.
3675 if upload_date: upload_date = unified_strdate(upload_date)
3677 info = {'id': video_id,
3680 'upload_date': upload_date,
3681 'title': video_title,
3687 class YouJizzIE(InfoExtractor):
3688 """Information extractor for youjizz.com."""
3689 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3691 def _real_extract(self, url):
3692 mobj = re.match(self._VALID_URL, url)
3694 raise ExtractorError(u'Invalid URL: %s' % url)
3696 video_id = mobj.group('videoid')
3698 # Get webpage content
3699 webpage = self._download_webpage(url, video_id)
3701 # Get the video title
3702 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3703 webpage, u'title').strip()
3705 # Get the embed page
# The real stream URL lives on a separate embed page; video_id is
# re-read from the embed URL (numeric id, may differ from the slug).
3706 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3708 raise ExtractorError(u'ERROR: unable to extract embed page')
3710 embed_page_url = result.group(0).strip()
3711 video_id = result.group('videoid')
3713 webpage = self._download_webpage(embed_page_url, video_id)
3716 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3717 webpage, u'video URL')
3719 info = {'id': video_id,
3721 'title': video_title,
3724 'player_url': embed_page_url}
3728 class EightTracksIE(InfoExtractor):
# Information extractor for 8tracks.com mixes. Walks the playlist via
# the play/next JSON API until at_last_track is reported.
3730 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3732 def _real_extract(self, url):
3733 mobj = re.match(self._VALID_URL, url)
3735 raise ExtractorError(u'Invalid URL: %s' % url)
3736 playlist_id = mobj.group('id')
3738 webpage = self._download_webpage(url, playlist_id)
3740 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3741 data = json.loads(json_like)
# Random client-side session id, as the site's own player generates.
3743 session = str(random.randint(0, 1000000000))
3745 track_count = data['tracks_count']
3746 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3747 next_url = first_url
3749 for i in itertools.count():
3750 api_json = self._download_webpage(next_url, playlist_id,
3751 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3752 errnote=u'Failed to download song information')
3753 api_data = json.loads(api_json)
3754 track_data = api_data[u'set']['track']
3756 'id': track_data['id'],
3757 'url': track_data['track_file_stream_url'],
3758 'title': track_data['performer'] + u' - ' + track_data['name'],
3759 'raw_title': track_data['name'],
3760 'uploader_id': data['user']['login'],
3764 if api_data['set']['at_last_track']:
3766 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3769 class KeekIE(InfoExtractor):
# Information extractor for keek.com; video and thumbnail URLs are
# constructed directly from the video id on their CDN.
3770 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3773 def _real_extract(self, url):
3774 m = re.match(self._VALID_URL, url)
3775 video_id = m.group('videoID')
3777 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3778 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3779 webpage = self._download_webpage(url, video_id)
3781 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3784 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3785 webpage, u'uploader', fatal=False)
3791 'title': video_title,
3792 'thumbnail': thumbnail,
3793 'uploader': uploader
3797 class TEDIE(InfoExtractor):
# Information extractor for www.ted.com: handles both single talks and
# playlists of talks (playlist entries are delegated back to this IE).
3798 _VALID_URL=r'''http://www\.ted\.com/
3800 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3802 ((?P<type_talk>talks)) # We have a simple talk
3804 (/lang/(.*?))? # The url may contain the language
3805 /(?P<name>\w+) # Here goes the name and then ".html"
3809 def suitable(cls, url):
3810 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL uses verbose-mode comments.
3811 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3813 def _real_extract(self, url):
3814 m=re.match(self._VALID_URL, url, re.VERBOSE)
3815 if m.group('type_talk'):
3816 return [self._talk_info(url)]
3818 playlist_id=m.group('playlist_id')
3819 name=m.group('name')
3820 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3821 return [self._playlist_videos_info(url,name,playlist_id)]
3823 def _playlist_videos_info(self,url,name,playlist_id=0):
3824 '''Returns the videos of the playlist'''
3826 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3827 ([.\s]*?)data-playlist_item_id="(\d+)"
3828 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3830 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3831 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3832 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3833 m_names=re.finditer(video_name_RE,webpage)
3835 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
3836 webpage, 'playlist title')
# Each playlist entry becomes a url_result pointing back at this IE.
3838 playlist_entries = []
3839 for m_video, m_name in zip(m_videos,m_names):
3840 video_id=m_video.group('video_id')
3841 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3842 playlist_entries.append(self.url_result(talk_url, 'TED'))
3843 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3845 def _talk_info(self, url, video_id=0):
3846 """Return the video for the talk in the url"""
3847 m = re.match(self._VALID_URL, url,re.VERBOSE)
3848 video_name = m.group('name')
3849 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
3850 self.report_extraction(video_name)
3851 # If the url includes the language we get the title translated
3852 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
3854 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
3855 webpage, 'json data')
3856 info = json.loads(json_data)
3857 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
3858 webpage, 'description', flags = re.DOTALL)
3860 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
3861 webpage, 'thumbnail')
# The last htmlStreams entry is used as the download URL.
3864 'url': info['htmlStreams'][-1]['file'],
3867 'thumbnail': thumbnail,
3868 'description': desc,
3872 class MySpassIE(InfoExtractor):
# Information extractor for myspass.de. All metadata comes from a
# dedicated XML endpoint keyed by the video id from the URL path.
3873 _VALID_URL = r'http://www.myspass.de/.*'
3875 def _real_extract(self, url):
3876 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3878 # video id is the last path element of the URL
3879 # usually there is a trailing slash, so also try the second but last
3880 url_path = compat_urllib_parse_urlparse(url).path
3881 url_parent_path, video_id = os.path.split(url_path)
3883 _, video_id = os.path.split(url_parent_path)
3886 metadata_url = META_DATA_URL_TEMPLATE % video_id
3887 metadata_text = self._download_webpage(metadata_url, video_id)
3888 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3890 # extract values from metadata
# url_flv and title are mandatory; format/description/thumbnail fall
# back gracefully when their elements are missing.
3891 url_flv_el = metadata.find('url_flv')
3892 if url_flv_el is None:
3893 raise ExtractorError(u'Unable to extract download url')
3894 video_url = url_flv_el.text
3895 extension = os.path.splitext(video_url)[1][1:]
3896 title_el = metadata.find('title')
3897 if title_el is None:
3898 raise ExtractorError(u'Unable to extract title')
3899 title = title_el.text
3900 format_id_el = metadata.find('format_id')
3901 if format_id_el is None:
3904 format = format_id_el.text
3905 description_el = metadata.find('description')
3906 if description_el is not None:
3907 description = description_el.text
3910 imagePreview_el = metadata.find('imagePreview')
3911 if imagePreview_el is not None:
3912 thumbnail = imagePreview_el.text
3921 'thumbnail': thumbnail,
3922 'description': description
3926 class SpiegelIE(InfoExtractor):
# Information extractor for spiegel.de videos; stream data comes from a
# per-video XML file on their flash server.
3927 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3929 def _real_extract(self, url):
3930 m = re.match(self._VALID_URL, url)
3931 video_id = m.group('videoID')
3933 webpage = self._download_webpage(url, video_id)
3935 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3938 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3939 xml_code = self._download_webpage(xml_url, video_id,
3940 note=u'Downloading XML', errnote=u'Failed to download XML')
3942 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last child element of the XML document is taken as the best
# rendition; its filename/duration children describe the stream.
3943 last_type = idoc[-1]
3944 filename = last_type.findall('./filename')[0].text
3945 duration = float(last_type.findall('./duration')[0].text)
3947 video_url = 'http://video2.spiegel.de/flash/' + filename
3948 video_ext = filename.rpartition('.')[2]
3953 'title': video_title,
3954 'duration': duration,
3958 class LiveLeakIE(InfoExtractor):
# Information extractor for liveleak.com view pages.
3960 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3961 IE_NAME = u'liveleak'
3963 def _real_extract(self, url):
3964 mobj = re.match(self._VALID_URL, url)
3966 raise ExtractorError(u'Invalid URL: %s' % url)
3968 video_id = mobj.group('video_id')
3970 webpage = self._download_webpage(url, video_id)
3972 video_url = self._search_regex(r'file: "(.*?)",',
3973 webpage, u'video URL')
# og:title carries a site prefix which is stripped off here.
3975 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3976 webpage, u'title').replace('LiveLeak.com -', '').strip()
3978 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3979 webpage, u'description', fatal=False)
3981 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3982 webpage, u'uploader', fatal=False)
3988 'title': video_title,
3989 'description': video_description,
3990 'uploader': video_uploader
3995 class ARDIE(InfoExtractor):
# Information extractor for the ARD Mediathek. Picks the default media
# type at the highest advertised quality; result is either an RTMP
# stream or a direct HTTP mp4 download.
3996 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3997 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3998 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4000 def _real_extract(self, url):
4001 # determine video id from url
# A documentId query parameter, when present, wins over the path id.
4002 m = re.match(self._VALID_URL, url)
4004 numid = re.search(r'documentId=([0-9]+)', url)
4006 video_id = numid.group(1)
4008 video_id = m.group('video_id')
4010 # determine title and media streams from webpage
4011 html = self._download_webpage(url, video_id)
4012 title = re.search(self._TITLE, html).group('title')
4013 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + an "fsk" marker in the page = age-restricted content
# that ARD only serves after 8 pm.
4015 assert '"fsk"' in html
4016 raise ExtractorError(u'This video is only available after 8:00 pm')
4018 # choose default media type and highest quality for now
4019 stream = max([s for s in streams if int(s["media_type"]) == 0],
4020 key=lambda s: int(s["quality"]))
4022 # there's two possibilities: RTMP stream or HTTP download
4023 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4024 if stream['rtmp_url']:
4025 self.to_screen(u'RTMP download detected')
4026 assert stream['video_url'].startswith('mp4:')
4027 info["url"] = stream["rtmp_url"]
4028 info["play_path"] = stream['video_url']
4030 assert stream["video_url"].endswith('.mp4')
4031 info["url"] = stream["video_url"]
4034 class ZDFIE(InfoExtractor):
# Information extractor for the ZDF Mediathek. Selects a Windows-Media
# ("wstreaming") stream, resolves it to an mms:// or rtsp:// URL via an
# intermediate playlist download.
4035 _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4036 _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
4037 _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
4038 _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
4039 _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
4041 def _real_extract(self, url):
4042 mobj = re.match(self._VALID_URL, url)
4044 raise ExtractorError(u'Invalid URL: %s' % url)
4045 video_id = mobj.group('video_id')
4047 html = self._download_webpage(url, video_id)
4048 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4050 raise ExtractorError(u'No media url found.')
4052 # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
4053 # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
4054 # choose first/default media type and highest quality for now
# Preference order: quality '300' wstreaming, then 'veryhigh'
# wstreaming; anything else is treated as no usable stream.
4055 for s in streams: #find 300 - dsl1000mbit
4056 if s['quality'] == '300' and s['media_type'] == 'wstreaming':
4059 for s in streams: #find veryhigh - dsl2000mbit
4060 if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
4064 raise ExtractorError(u'No stream found.')
# The chosen stream URL points at a playlist file that contains the
# real mms:// (or rtsp://) media link.
4066 media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
4068 self.report_extraction(video_id)
4069 mobj = re.search(self._TITLE, html)
4071 raise ExtractorError(u'Cannot extract title')
4072 title = unescapeHTML(mobj.group('title'))
4074 mobj = re.search(self._MMS_STREAM, media_link)
4076 mobj = re.search(self._RTSP_STREAM, media_link)
4078 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
4079 mms_url = mobj.group('video_url')
# Extension is taken from the final media URL's last dot-suffix.
4081 mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
4083 raise ExtractorError(u'Cannot extract extention')
4084 ext = mobj.group('ext')
4086 return [{'id': video_id,
# Extractor for Tumblr video posts.  The media URL is embedded in escaped
# JavaScript (hence the literal \x22 sequences standing for double quotes).
# NOTE(review): some guard lines are elided from this numbered listing.
4092 class TumblrIE(InfoExtractor):
4093 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4095 def _real_extract(self, url):
4096 m_url = re.match(self._VALID_URL, url)
4097 video_id = m_url.group('id')
4098 blog = m_url.group('blog_name')
# Canonicalize to the /post/ URL regardless of which form matched.
4100 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4101 webpage = self._download_webpage(url, video_id)
4103 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4104 video = re.search(re_video, webpage)
4106 raise ExtractorError(u'Unable to extract video')
4107 video_url = video.group('video_url')
4108 ext = video.group('ext')
4110 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
4111 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Strip the JavaScript escaping backslashes from the thumbnail URL.
4112 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
4114 # The only place where you can get a title, it's not complete,
4115 # but searching in other places doesn't work for all videos
4116 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
4117 webpage, u'title', flags=re.DOTALL)
4119 return [{'id': video_id,
4121 'title': video_title,
4122 'thumbnail': video_thumbnail,
# Extractor for free Bandcamp tracks: follows the "free download" page and
# rebuilds the final mp3-320 download URL from its components.
# NOTE(review): 'id' shadows the builtin here; renaming would be a code change,
# so it is only flagged.  Some listing lines are elided.
4126 class BandcampIE(InfoExtractor):
4127 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4129 def _real_extract(self, url):
4130 mobj = re.match(self._VALID_URL, url)
4131 title = mobj.group('title')
4132 webpage = self._download_webpage(url, title)
4133 # We get the link to the free download page
4134 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4135 if m_download is None:
4136 raise ExtractorError(u'No free songs found')
4138 download_link = m_download.group(1)
# Track id is scraped from the inline TralbumData JavaScript object.
4139 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4140 webpage, re.MULTILINE|re.DOTALL).group('id')
4142 download_webpage = self._download_webpage(download_link, id,
4143 'Downloading free downloads page')
4144 # We get the dictionary of the track from some javascrip code
4145 info = re.search(r'items: (.*?),$',
4146 download_webpage, re.MULTILINE).group(1)
4147 info = json.loads(info)[0]
4148 # We pick mp3-320 for now, until format selection can be easily implemented.
4149 mp3_info = info[u'downloads'][u'mp3-320']
4150 # If we try to use this url it says the link has expired
4151 initial_url = mp3_info[u'url']
4152 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4153 m_url = re.match(re_url, initial_url)
4154 #We build the url we will use to get the final track url
4155 # This url is build in Bandcamp in the script download_bunde_*.js
4156 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4157 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4158 # If we could correctly generate the .rand field the url would be
4159 #in the "download_url" key
4160 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4162 track_info = {'id':id,
4163 'title' : info[u'title'],
4166 'thumbnail' : info[u'thumb_url'],
4167 'uploader' : info[u'artist']
# Extractor for redtube.com; video URL and title are scraped from the page
# HTML with fixed regexes.  NOTE(review): the "if mobj is None:" guard line
# before the raise is elided in this numbered listing.
4172 class RedTubeIE(InfoExtractor):
4173 """Information Extractor for redtube"""
4174 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4176 def _real_extract(self,url):
4177 mobj = re.match(self._VALID_URL, url)
4179 raise ExtractorError(u'Invalid URL: %s' % url)
4181 video_id = mobj.group('id')
4182 video_extension = 'mp4'
4183 webpage = self._download_webpage(url, video_id)
4185 self.report_extraction(video_id)
4187 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
4188 webpage, u'video URL')
4190 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
# Tail of the returned info dict (leading entries elided in this listing).
4196 'ext': video_extension,
4197 'title': video_title,
# Extractor for Ina.fr: fetches the player's MRSS feed for the video id and
# reads the mp4 URL and CDATA title from it.
4200 class InaIE(InfoExtractor):
4201 """Information Extractor for Ina.fr"""
4202 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4204 def _real_extract(self,url):
4205 mobj = re.match(self._VALID_URL, url)
4207 video_id = mobj.group('id')
4208 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4209 video_extension = 'mp4'
# The "webpage" here is actually the MRSS XML document, not HTML.
4210 webpage = self._download_webpage(mrss_url, video_id)
4212 self.report_extraction(video_id)
4214 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
4215 webpage, u'video URL')
4217 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
# Tail of the returned info dict (leading entries elided in this listing).
4223 'ext': video_extension,
4224 'title': video_title,
# Extractor for Howcast.com; pulls the mobile mp4 URL plus og:/meta tags for
# title, description and thumbnail.
4227 class HowcastIE(InfoExtractor):
4228 """Information Extractor for Howcast.com"""
4229 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4231 def _real_extract(self, url):
4232 mobj = re.match(self._VALID_URL, url)
4234 video_id = mobj.group('id')
# Re-canonicalize the URL so query strings/fragments are dropped.
4235 webpage_url = 'http://www.howcast.com/videos/' + video_id
4236 webpage = self._download_webpage(webpage_url, video_id)
4238 self.report_extraction(video_id)
4240 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
4241 webpage, u'video URL')
4243 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
4246 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
4247 webpage, u'description', fatal=False)
4249 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
4250 webpage, u'thumbnail', fatal=False)
# Tail of the returned info dict (leading entries elided in this listing).
4256 'title': video_title,
4257 'description': video_description,
4258 'thumbnail': thumbnail,
# Extractor for Vine.co; the stream URL comes from the twitter:player:stream
# meta tag, other metadata from og: tags and the user block.
4261 class VineIE(InfoExtractor):
4262 """Information Extractor for Vine.co"""
4263 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4265 def _real_extract(self, url):
4266 mobj = re.match(self._VALID_URL, url)
4268 video_id = mobj.group('id')
4269 webpage_url = 'https://vine.co/v/' + video_id
4270 webpage = self._download_webpage(webpage_url, video_id)
4272 self.report_extraction(video_id)
4274 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
4275 webpage, u'video URL')
4277 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# Query string on the thumbnail URL is deliberately excluded from the capture.
4280 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
4281 webpage, u'thumbnail', fatal=False)
4283 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
4284 webpage, u'uploader', fatal=False, flags=re.DOTALL)
# Tail of the returned info dict (leading entries elided in this listing).
4290 'title': video_title,
4291 'thumbnail': thumbnail,
4292 'uploader': uploader,
# Extractor for Flickr videos: two intermediate XML documents are fetched
# (video_mtl_xml.gne, then video_playlist.gne) to resolve the RTMP-ish
# STREAM APP/FULLPATH pair into a playable URL.
4295 class FlickrIE(InfoExtractor):
4296 """Information Extractor for Flickr videos"""
4297 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4299 def _real_extract(self, url):
4300 mobj = re.match(self._VALID_URL, url)
4302 video_id = mobj.group('id')
4303 video_uploader_id = mobj.group('uploader_id')
4304 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4305 webpage = self._download_webpage(webpage_url, video_id)
# The per-photo "secret" is required by both XML endpoints below.
4307 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
4309 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4310 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
4312 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
4313 first_xml, u'node_id')
4315 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4316 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4318 self.report_extraction(video_id)
4320 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
4322 raise ExtractorError(u'Unable to extract video url')
# Final URL = APP prefix + HTML-unescaped FULLPATH.
4323 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
4325 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
4326 webpage, u'video title')
4328 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
4329 webpage, u'description', fatal=False)
4331 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
4332 webpage, u'thumbnail', fatal=False)
# Tail of the returned info dict (leading entries elided in this listing).
4338 'title': video_title,
4339 'description': video_description,
4340 'thumbnail': thumbnail,
4341 'uploader_id': video_uploader_id,
# Extractor for teamcoco.com: resolves the article's data-id, then fetches a
# per-video XML document (cvp/2.0/<id>.xml) for the high-quality file URL.
4344 class TeamcocoIE(InfoExtractor):
4345 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4347 def _real_extract(self, url):
4348 mobj = re.match(self._VALID_URL, url)
4350 raise ExtractorError(u'Invalid URL: %s' % url)
4351 url_title = mobj.group('url_title')
4352 webpage = self._download_webpage(url, url_title)
4354 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
4355 webpage, u'video id')
4357 self.report_extraction(video_id)
4359 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
4362 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
4363 webpage, u'thumbnail', fatal=False)
4365 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
4366 webpage, u'description', fatal=False)
4368 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4369 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
# Pick the <file type="high"> entry from the XML.
4371 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
# Tail of the returned info dict (leading entries elided in this listing).
4378 'title': video_title,
4379 'thumbnail': thumbnail,
4380 'description': video_description,
# Extractor for xHamster: media URL comes from inline JS ('srv'/'file'); the
# upload date, uploader id and thumbnail are scraped separately.
# NOTE(review): several "if mobj is None:"/"else:" lines are elided here.
4383 class XHamsterIE(InfoExtractor):
4384 """Information Extractor for xHamster"""
4385 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4387 def _real_extract(self,url):
4388 mobj = re.match(self._VALID_URL, url)
4390 video_id = mobj.group('id')
4391 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
4392 webpage = self._download_webpage(mrss_url, video_id)
4394 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4396 raise ExtractorError(u'Unable to extract media URL')
# Empty 'srv' means 'file' is already a full (percent-encoded) URL;
# otherwise the URL is server + '/key=' + file.
4397 if len(mobj.group('server')) == 0:
4398 video_url = compat_urllib_parse.unquote(mobj.group('file'))
4400 video_url = mobj.group('server')+'/key='+mobj.group('file')
4401 video_extension = video_url.split('.')[-1]
4403 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
4406 # Can't see the description anywhere in the UI
4407 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
4408 # webpage, u'description', fatal=False)
4409 # if video_description: video_description = unescapeHTML(video_description)
# Upload date is reassembled as YYYYMMDD from a tooltip timestamp.
4411 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4413 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4415 video_upload_date = None
4416 self._downloader.report_warning(u'Unable to extract upload date')
4418 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
4419 webpage, u'uploader id', default=u'anonymous')
4421 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
4422 webpage, u'thumbnail', fatal=False)
# Tail of the returned info dict (leading entries elided in this listing).
4427 'ext': video_extension,
4428 'title': video_title,
4429 # 'description': video_description,
4430 'upload_date': video_upload_date,
4431 'uploader_id': video_uploader_id,
4432 'thumbnail': video_thumbnail
# Extractor for Hype Machine (hypem.com): loads the displayList-data JSON,
# then calls /serve/source with the session cookie to get the final URL.
# NOTE(review): try/except blocks around the json.loads calls are elided here.
4435 class HypemIE(InfoExtractor):
4436 """Information Extractor for hypem"""
4437 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4439 def _real_extract(self, url):
4440 mobj = re.match(self._VALID_URL, url)
4442 raise ExtractorError(u'Invalid URL: %s' % url)
4443 track_id = mobj.group(1)
# Cache-busting timestamp parameter for the first request.
4445 data = { 'ax': 1, 'ts': time.time() }
4446 data_encoded = compat_urllib_parse.urlencode(data)
4447 complete_url = url + "?" + data_encoded
4448 request = compat_urllib_request.Request(complete_url)
4449 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# Session cookie must be replayed on the /serve/source request below.
4450 cookie = urlh.headers.get('Set-Cookie', '')
4452 self.report_extraction(track_id)
4454 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
4455 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
4457 track_list = json.loads(html_tracks)
4458 track = track_list[u'tracks'][0]
4460 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4463 track_id = track[u"id"]
4464 artist = track[u"artist"]
4465 title = track[u"song"]
# NOTE(review): 'key' used below is assigned on a line elided from this
# listing — presumably key = track[u"key"]; verify against full source.
4467 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4468 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4469 request.add_header('cookie', cookie)
4470 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4472 song_data = json.loads(song_data_json)
4474 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4475 final_url = song_data[u"url"]
# Extractor for vbox7.com: follows a JavaScript redirect, then POSTs to
# play/magare.do to obtain "url=...&thumb=..." style media info.
4485 class Vbox7IE(InfoExtractor):
4486 """Information Extractor for Vbox7"""
4487 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
4489 def _real_extract(self,url):
4490 mobj = re.match(self._VALID_URL, url)
4492 raise ExtractorError(u'Invalid URL: %s' % url)
4493 video_id = mobj.group(1)
4495 redirect_page, urlh = self._download_webpage_handle(url, video_id)
4496 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
4497 redirect_url = urlh.geturl() + new_location
4498 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
4500 title = self._html_search_regex(r'<title>(.*)</title>',
4501 webpage, u'title').split('/')[0].strip()
4504 info_url = "http://vbox7.com/play/magare.do"
4505 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
4506 info_request = compat_urllib_request.Request(info_url, data)
4507 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
4508 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
4509 if info_response is None:
4510 raise ExtractorError(u'Unable to extract the media url')
# Response is a two-field urlencoded body; split into final URL + thumbnail.
4511 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
# Tail of the returned info dict (leading entries elided in this listing).
4518 'thumbnail': thumbnail_url,
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com.

    Resolves the page's MTV-style "mgid" content id, then queries the
    mrss (metadata) and mediagen (stream list) feeds.  Raises
    ExtractorError on any failure; returns a single info dict whose
    'ext' is 'flv' (the server delivers flv despite mp4-looking URLs).
    """
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid differently from regular videos/reviews.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        # Title, description and thumbnail all live in the mrss feed.
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''
        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        if m_urls is None or len(m_urls) == 0:
            # BUGFIX: previously raised the undefined name "ExtractError"
            # (a NameError at runtime) with the message "Unable to extrat
            # video url"; corrected to ExtractorError and proper spelling.
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
# Factory returning the ordered list of all extractor instances.
# NOTE(review): nearly all entries of the returned list are elided from this
# numbered listing (see the gaps in the embedded line numbers); only a few
# representative entries are visible below.
4572 def gen_extractors():
4573 """ Return a list of an instance of every supported extractor.
4574 The order does matter; the first extractor matched is the one handling the URL.
4577 YoutubePlaylistIE(),
4602 StanfordOpenClassroomIE(),
4612 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up and return the extractor class named ``<ie_name>IE``.

    The class is resolved from this module's global namespace; a missing
    extractor therefore surfaces as a KeyError.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]