youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19 import hashlib
  20 import binascii
  21 import urllib
  22
  23 from .utils import *
  24
  25
  26 class InfoExtractor(object):
  27     """Information Extractor class.
  28
  29     Information extractors are the classes that, given a URL, extract
  30     information about the video (or videos) the URL refers to. This
  31     information includes the real video URL, the video title, author and
  32     others. The information is stored in a dictionary which is then
  33     passed to the FileDownloader. The FileDownloader processes this
  34     information possibly downloading the video to the file system, among
  35     other possible outcomes.
  36
  37     The dictionaries must include the following fields:
  38
  39     id:             Video identifier.
  40     url:            Final video URL.
  41     title:          Video title, unescaped.
  42     ext:            Video filename extension.
  43
  44     The following fields are optional:
  45
  46     format:         The video format, defaults to ext (used for --get-format)
  47     thumbnail:      Full URL to a video thumbnail image.
  48     description:    One-line video description.
  49     uploader:       Full name of the video uploader.
  50     upload_date:    Video upload date (YYYYMMDD).
  51     uploader_id:    Nickname or id of the video uploader.
  52     location:       Physical location of the video.
  53     player_url:     SWF Player URL (used for rtmpdump).
  54     subtitles:      The subtitle file contents.
  55     urlhandle:      [internal] The urlHandle to be used to download the file,
  56                     like returned by urllib.request.urlopen
  57
  58     The fields should all be Unicode strings.
  59
  60     Subclasses of this one should re-define the _real_initialize() and
  61     _real_extract() methods and define a _VALID_URL regexp.
  62     Probably, they should also be added to the list of extractors.
  63
  64     _real_extract() must return a *list* of information dictionaries as
  65     described above.
  66
  67     Finally, the _WORKING attribute should be set to False for broken IEs
  68     in order to warn the users and skip the tests.
  69     """
  70
  71     _ready = False
  72     _downloader = None
  73     _WORKING = True
  74
  75     def __init__(self, downloader=None):
  76         """Constructor. Receives an optional downloader."""
  77         self._ready = False
  78         self.set_downloader(downloader)
  79
  80     @classmethod
  81     def suitable(cls, url):
  82         """Receives a URL and returns True if suitable for this IE."""
  83         return re.match(cls._VALID_URL, url) is not None
  84
  85     @classmethod
  86     def working(cls):
  87         """Getter method for _WORKING."""
  88         return cls._WORKING
  89
  90     def initialize(self):
  91         """Initializes an instance (authentication, etc)."""
  92         if not self._ready:
  93             self._real_initialize()
  94             self._ready = True
  95
  96     def extract(self, url):
  97         """Extracts URL information and returns it in list of dicts."""
  98         self.initialize()
  99         return self._real_extract(url)
 100
 101     def set_downloader(self, downloader):
 102         """Sets the downloader for this IE."""
 103         self._downloader = downloader
 104
 105     def _real_initialize(self):
 106         """Real initialization process. Redefine in subclasses."""
 107         pass
 108
 109     def _real_extract(self, url):
 110         """Real extraction process. Redefine in subclasses."""
 111         pass
 112
 113     @property
 114     def IE_NAME(self):
 115         return type(self).__name__[:-2]
 116
 117     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 118         """ Returns the response handle """
 119         if note is None:
 120             self.report_download_webpage(video_id)
 121         elif note is not False:
 122             self.to_screen(u'%s: %s' % (video_id, note))
 123         try:
 124             return compat_urllib_request.urlopen(url_or_request)
 125         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 126             if errnote is None:
 127                 errnote = u'Unable to download webpage'
 128             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 129
 130     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 131         """ Returns a tuple (page content as string, URL handle) """
 132         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 133         content_type = urlh.headers.get('Content-Type', '')
 134         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 135         if m:
 136             encoding = m.group(1)
 137         else:
 138             encoding = 'utf-8'
 139         webpage_bytes = urlh.read()
 140         if self._downloader.params.get('dump_intermediate_pages', False):
 141             try:
 142                 url = url_or_request.get_full_url()
 143             except AttributeError:
 144                 url = url_or_request
 145             self.to_screen(u'Dumping request to ' + url)
 146             dump = base64.b64encode(webpage_bytes).decode('ascii')
 147             self._downloader.to_screen(dump)
 148         content = webpage_bytes.decode(encoding, 'replace')
 149         return (content, urlh)
 150
 151     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 152         """ Returns the data of the page as a string """
 153         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 154
 155     def to_screen(self, msg):
 156         """Print msg to screen, prefixing it with '[ie_name]'"""
 157         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 158
 159     def report_extraction(self, id_or_name):
 160         """Report information extraction."""
 161         self.to_screen(u'%s: Extracting information' % id_or_name)
 162
 163     def report_download_webpage(self, video_id):
 164         """Report webpage download."""
 165         self.to_screen(u'%s: Downloading webpage' % video_id)
 166
 167     def report_age_confirmation(self):
 168         """Report attempt to confirm age."""
 169         self.to_screen(u'Confirming age')
 170
 171     #Methods for following #608
 172     #They set the correct value of the '_type' key
 173     def video_result(self, video_info):
 174         """Returns a video"""
 175         video_info['_type'] = 'video'
 176         return video_info
 177     def url_result(self, url, ie=None):
 178         """Returns a url that points to a page that should be processed"""
 179         #TODO: ie should be the class used for getting the info
 180         video_info = {'_type': 'url',
 181                       'url': url,
 182                       'ie_key': ie}
 183         return video_info
 184     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 185         """Returns a playlist"""
 186         video_info = {'_type': 'playlist',
 187                       'entries': entries}
 188         if playlist_id:
 189             video_info['id'] = playlist_id
 190         if playlist_title:
 191             video_info['title'] = playlist_title
 192         return video_info
 193
 194     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 195         """
 196         Perform a regex search on the given string, using a single or a list of
 197         patterns returning the first matching group.
 198         In case of failure return a default value or raise a WARNING or a
 199         ExtractorError, depending on fatal, specifying the field name.
 200         """
 201         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 202             mobj = re.search(pattern, string, flags)
 203         else:
 204             for p in pattern:
 205                 mobj = re.search(p, string, flags)
 206                 if mobj: break
 207
 208         if sys.stderr.isatty() and os.name != 'nt':
 209             _name = u'\033[0;34m%s\033[0m' % name
 210         else:
 211             _name = name
 212
 213         if mobj:
 214             # return the first matching group
 215             return next(g for g in mobj.groups() if g is not None)
 216         elif default is not None:
 217             return default
 218         elif fatal:
 219             raise ExtractorError(u'Unable to extract %s' % _name)
 220         else:
 221             self._downloader.report_warning(u'unable to extract %s; '
 222                 u'please report this issue on GitHub.' % _name)
 223             return None
 224
 225     def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 226         """
 227         Like _search_regex, but strips HTML tags and unescapes entities.
 228         """
 229         res = self._search_regex(pattern, string, name, default, fatal, flags)
 230         if res:
 231             return clean_html(res).strip()
 232         else:
 233             return res
 234
 235 class SearchInfoExtractor(InfoExtractor):
 236     """
 237     Base class for paged search queries extractors.
 238     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 239     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 240     """
 241
 242     @classmethod
 243     def _make_valid_url(cls):
 244         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 245
 246     @classmethod
 247     def suitable(cls, url):
 248         return re.match(cls._make_valid_url(), url) is not None
 249
 250     def _real_extract(self, query):
 251         mobj = re.match(self._make_valid_url(), query)
 252         if mobj is None:
 253             raise ExtractorError(u'Invalid search query "%s"' % query)
 254
 255         prefix = mobj.group('prefix')
 256         query = mobj.group('query')
 257         if prefix == '':
 258             return self._get_n_results(query, 1)
 259         elif prefix == 'all':
 260             return self._get_n_results(query, self._MAX_RESULTS)
 261         else:
 262             n = int(prefix)
 263             if n <= 0:
 264                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 265             elif n > self._MAX_RESULTS:
 266                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 267                 n = self._MAX_RESULTS
 268             return self._get_n_results(query, n)
 269
 270     def _get_n_results(self, query, n):
 271         """Get a specified number of results for a query"""
 272         raise NotImplementedError("This method must be implemented by sublclasses")
 273
 274
 275 class YoutubeIE(InfoExtractor):
 276     """Information extractor for youtube.com."""
 277
 278     _VALID_URL = r"""^
 279                      (
 280                          (?:https?://)?                                       # http(s):// (optional)
 281                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 282                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 283                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 284                          (?:                                                  # the various things that can precede the ID:
 285                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 286                              |(?:                                             # or the v= param in all its forms
 287                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 288                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 289                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 290                                  v=
 291                              )
 292                          )?                                                   # optional -> youtube.com/xxxx is OK
 293                      )?                                                       # all until now is optional -> you can pass the naked ID
 294                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 295                      (?(1).+)?                                                # if we found the ID, everything can follow
 296                      $"""
 297     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 298     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 299     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 300     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 301     _NETRC_MACHINE = 'youtube'
 302     # Listed in order of quality
 303     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 304     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 305     _video_extensions = {
 306         '13': '3gp',
 307         '17': 'mp4',
 308         '18': 'mp4',
 309         '22': 'mp4',
 310         '37': 'mp4',
 311         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 312         '43': 'webm',
 313         '44': 'webm',
 314         '45': 'webm',
 315         '46': 'webm',
 316     }
 317     _video_dimensions = {
 318         '5': '240x400',
 319         '6': '???',
 320         '13': '???',
 321         '17': '144x176',
 322         '18': '360x640',
 323         '22': '720x1280',
 324         '34': '360x640',
 325         '35': '480x854',
 326         '37': '1080x1920',
 327         '38': '3072x4096',
 328         '43': '360x640',
 329         '44': '480x854',
 330         '45': '720x1280',
 331         '46': '1080x1920',
 332     }
 333     IE_NAME = u'youtube'
 334
 335     @classmethod
 336     def suitable(cls, url):
 337         """Receives a URL and returns True if suitable for this IE."""
 338         if YoutubePlaylistIE.suitable(url): return False
 339         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 340
 341     def report_lang(self):
 342         """Report attempt to set language."""
 343         self.to_screen(u'Setting language')
 344
 345     def report_login(self):
 346         """Report attempt to log in."""
 347         self.to_screen(u'Logging in')
 348
 349     def report_video_webpage_download(self, video_id):
 350         """Report attempt to download video webpage."""
 351         self.to_screen(u'%s: Downloading video webpage' % video_id)
 352
 353     def report_video_info_webpage_download(self, video_id):
 354         """Report attempt to download video info webpage."""
 355         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 356
 357     def report_video_subtitles_download(self, video_id):
 358         """Report attempt to download video info webpage."""
 359         self.to_screen(u'%s: Checking available subtitles' % video_id)
 360
 361     def report_video_subtitles_request(self, video_id, sub_lang, format):
 362         """Report attempt to download video info webpage."""
 363         self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 364
 365     def report_video_subtitles_available(self, video_id, sub_lang_list):
 366         """Report available subtitles."""
 367         sub_lang = ",".join(list(sub_lang_list.keys()))
 368         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
 369
 370     def report_information_extraction(self, video_id):
 371         """Report attempt to extract video information."""
 372         self.to_screen(u'%s: Extracting video information' % video_id)
 373
 374     def report_unavailable_format(self, video_id, format):
 375         """Report extracted video URL."""
 376         self.to_screen(u'%s: Format %s not available' % (video_id, format))
 377
 378     def report_rtmp_download(self):
 379         """Indicate the download will use the RTMP protocol."""
 380         self.to_screen(u'RTMP download detected')
 381
 382     def _get_available_subtitles(self, video_id):
 383         self.report_video_subtitles_download(video_id)
 384         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 385         try:
 386             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 387         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 388             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 389         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 390         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 391         if not sub_lang_list:
 392             return (u'video doesn\'t have subtitles', None)
 393         return sub_lang_list
 394
 395     def _list_available_subtitles(self, video_id):
 396         sub_lang_list = self._get_available_subtitles(video_id)
 397         self.report_video_subtitles_available(video_id, sub_lang_list)
 398
 399     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 400         """
 401         Return tuple:
 402         (error_message, sub_lang, sub)
 403         """
 404         self.report_video_subtitles_request(video_id, sub_lang, format)
 405         params = compat_urllib_parse.urlencode({
 406             'lang': sub_lang,
 407             'name': sub_name,
 408             'v': video_id,
 409             'fmt': format,
 410         })
 411         url = 'http://www.youtube.com/api/timedtext?' + params
 412         try:
 413             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 414         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 415             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 416         if not sub:
 417             return (u'Did not fetch video subtitles', None, None)
 418         return (None, sub_lang, sub)
 419
 420     def _request_automatic_caption(self, video_id, webpage):
 421         """We need the webpage for getting the captions url, pass it as an
 422            argument to speed up the process."""
 423         sub_lang = self._downloader.params.get('subtitleslang')
 424         sub_format = self._downloader.params.get('subtitlesformat')
 425         self.to_screen(u'%s: Looking for automatic captions' % video_id)
 426         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 427         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
 428         if mobj is None:
 429             return [(err_msg, None, None)]
 430         player_config = json.loads(mobj.group(1))
 431         try:
 432             args = player_config[u'args']
 433             caption_url = args[u'ttsurl']
 434             timestamp = args[u'timestamp']
 435             params = compat_urllib_parse.urlencode({
 436                 'lang': 'en',
 437                 'tlang': sub_lang,
 438                 'fmt': sub_format,
 439                 'ts': timestamp,
 440                 'kind': 'asr',
 441             })
 442             subtitles_url = caption_url + '&' + params
 443             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
 444             return [(None, sub_lang, sub)]
 445         except KeyError:
 446             return [(err_msg, None, None)]
 447
 448     def _extract_subtitle(self, video_id):
 449         """
 450         Return a list with a tuple:
 451         [(error_message, sub_lang, sub)]
 452         """
 453         sub_lang_list = self._get_available_subtitles(video_id)
 454         sub_format = self._downloader.params.get('subtitlesformat')
 455         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 456             return [(sub_lang_list[0], None, None)]
 457         if self._downloader.params.get('subtitleslang', False):
 458             sub_lang = self._downloader.params.get('subtitleslang')
 459         elif 'en' in sub_lang_list:
 460             sub_lang = 'en'
 461         else:
 462             sub_lang = list(sub_lang_list.keys())[0]
 463         if not sub_lang in sub_lang_list:
 464             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 465
 466         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 467         return [subtitle]
 468
 469     def _extract_all_subtitles(self, video_id):
 470         sub_lang_list = self._get_available_subtitles(video_id)
 471         sub_format = self._downloader.params.get('subtitlesformat')
 472         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 473             return [(sub_lang_list[0], None, None)]
 474         subtitles = []
 475         for sub_lang in sub_lang_list:
 476             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 477             subtitles.append(subtitle)
 478         return subtitles
 479
 480     def _print_formats(self, formats):
 481         print('Available formats:')
 482         for x in formats:
 483             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 484
 485     def _real_initialize(self):
 486         if self._downloader is None:
 487             return
 488
 489         username = None
 490         password = None
 491         downloader_params = self._downloader.params
 492
 493         # Attempt to use provided username and password or .netrc data
 494         if downloader_params.get('username', None) is not None:
 495             username = downloader_params['username']
 496             password = downloader_params['password']
 497         elif downloader_params.get('usenetrc', False):
 498             try:
 499                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 500                 if info is not None:
 501                     username = info[0]
 502                     password = info[2]
 503                 else:
 504                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 505             except (IOError, netrc.NetrcParseError) as err:
 506                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 507                 return
 508
 509         # Set language
 510         request = compat_urllib_request.Request(self._LANG_URL)
 511         try:
 512             self.report_lang()
 513             compat_urllib_request.urlopen(request).read()
 514         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 515             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 516             return
 517
 518         # No authentication to be performed
 519         if username is None:
 520             return
 521
 522         request = compat_urllib_request.Request(self._LOGIN_URL)
 523         try:
 524             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 525         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 526             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 527             return
 528
 529         galx = None
 530         dsh = None
 531         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 532         if match:
 533           galx = match.group(1)
 534
 535         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 536         if match:
 537           dsh = match.group(1)
 538
 539         # Log in
 540         login_form_strs = {
 541                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 542                 u'Email': username,
 543                 u'GALX': galx,
 544                 u'Passwd': password,
 545                 u'PersistentCookie': u'yes',
 546                 u'_utf8': u'霱',
 547                 u'bgresponse': u'js_disabled',
 548                 u'checkConnection': u'',
 549                 u'checkedDomains': u'youtube',
 550                 u'dnConn': u'',
 551                 u'dsh': dsh,
 552                 u'pstMsg': u'0',
 553                 u'rmShown': u'1',
 554                 u'secTok': u'',
 555                 u'signIn': u'Sign in',
 556                 u'timeStmp': u'',
 557                 u'service': u'youtube',
 558                 u'uilel': u'3',
 559                 u'hl': u'en_US',
 560         }
 561         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 562         # chokes on unicode
 563         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 564         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 565         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 566         try:
 567             self.report_login()
 568             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 569             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 570                 self._downloader.report_warning(u'unable to log in: bad username or password')
 571                 return
 572         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 573             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 574             return
 575
 576         # Confirm age
 577         age_form = {
 578                 'next_url':     '/',
 579                 'action_confirm':   'Confirm',
 580                 }
 581         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 582         try:
 583             self.report_age_confirmation()
 584             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 585         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 586             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 587
 588     def _extract_id(self, url):
 589         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 590         if mobj is None:
 591             raise ExtractorError(u'Invalid URL: %s' % url)
 592         video_id = mobj.group(2)
 593         return video_id
 594
 595     def _real_extract(self, url):
 596         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 597         mobj = re.search(self._NEXT_URL_RE, url)
 598         if mobj:
 599             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 600         video_id = self._extract_id(url)
 601
 602         # Get video webpage
 603         self.report_video_webpage_download(video_id)
 604         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 605         request = compat_urllib_request.Request(url)
 606         try:
 607             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 608         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 609             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
 610
 611         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 612
 613         # Attempt to extract SWF player URL
 614         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 615         if mobj is not None:
 616             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 617         else:
 618             player_url = None
 619
 620         # Get video info
 621         self.report_video_info_webpage_download(video_id)
 622         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 623             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 624                     % (video_id, el_type))
 625             video_info_webpage = self._download_webpage(video_info_url, video_id,
 626                                     note=False,
 627                                     errnote='unable to download video info webpage')
 628             video_info = compat_parse_qs(video_info_webpage)
 629             if 'token' in video_info:
 630                 break
 631         if 'token' not in video_info:
 632             if 'reason' in video_info:
 633                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
 634             else:
 635                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
 636
 637         # Check for "rental" videos
 638         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 639             raise ExtractorError(u'"rental" videos not supported')
 640
 641         # Start extracting information
 642         self.report_information_extraction(video_id)
 643
 644         # uploader
 645         if 'author' not in video_info:
 646             raise ExtractorError(u'Unable to extract uploader name')
 647         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 648
 649         # uploader_id
 650         video_uploader_id = None
 651         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 652         if mobj is not None:
 653             video_uploader_id = mobj.group(1)
 654         else:
 655             self._downloader.report_warning(u'unable to extract uploader nickname')
 656
 657         # title
 658         if 'title' not in video_info:
 659             raise ExtractorError(u'Unable to extract video title')
 660         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 661
 662         # thumbnail image
 663         if 'thumbnail_url' not in video_info:
 664             self._downloader.report_warning(u'unable to extract video thumbnail')
 665             video_thumbnail = ''
 666         else:   # don't panic if we can't find it
 667             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 668
 669         # upload date
 670         upload_date = None
 671         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 672         if mobj is not None:
 673             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 674             upload_date = unified_strdate(upload_date)
 675
 676         # description
 677         video_description = get_element_by_id("eow-description", video_webpage)
 678         if video_description:
 679             video_description = clean_html(video_description)
 680         else:
 681             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 682             if fd_mobj:
 683                 video_description = unescapeHTML(fd_mobj.group(1))
 684             else:
 685                 video_description = u''
 686
 687         # subtitles
 688         video_subtitles = None
 689
 690         if self._downloader.params.get('writesubtitles', False):
 691             video_subtitles = self._extract_subtitle(video_id)
 692             if video_subtitles:
 693                 (sub_error, sub_lang, sub) = video_subtitles[0]
 694                 if sub_error:
 695                     # We try with the automatic captions
 696                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
 697                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
 698                     if sub is not None:
 699                         pass
 700                     else:
 701                         # We report the original error
 702                         self._downloader.report_error(sub_error)
 703
 704         if self._downloader.params.get('allsubtitles', False):
 705             video_subtitles = self._extract_all_subtitles(video_id)
 706             for video_subtitle in video_subtitles:
 707                 (sub_error, sub_lang, sub) = video_subtitle
 708                 if sub_error:
 709                     self._downloader.report_error(sub_error)
 710
 711         if self._downloader.params.get('listsubtitles', False):
 712             sub_lang_list = self._list_available_subtitles(video_id)
 713             return
 714
 715         if 'length_seconds' not in video_info:
 716             self._downloader.report_warning(u'unable to extract video duration')
 717             video_duration = ''
 718         else:
 719             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 720
 721         # token
 722         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 723
 724         # Decide which formats to download
 725         req_format = self._downloader.params.get('format', None)
 726
 727         try:
 728             mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
 729             info = json.loads(mobj.group(1))
 730             args = info['args']
 731             if args.get('ptk','') == 'vevo' or 'dashmpd':
 732                 # Vevo videos with encrypted signatures
 733                 self.to_screen(u'Vevo video detected.')
 734                 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
 735         except ValueError:
 736             pass
 737
 738         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 739             self.report_rtmp_download()
 740             video_url_list = [(None, video_info['conn'][0])]
 741         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 742             url_map = {}
 743             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
 744                 url_data = compat_parse_qs(url_data_str)
 745                 if 'itag' in url_data and 'url' in url_data:
 746                     url = url_data['url'][0]
 747                     if 'sig' in url_data:
 748                         url += '&signature=' + url_data['sig'][0]
 749                     if 's' in url_data:
 750                         def k(s):
 751                             """Decrypt the key the two subkeys must have a length of 43"""
 752                             (a,b) = s.split('.')
 753                             b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40]
 754                             a = a[-40:]
 755                             s_dec = '.'.join((a,b))[::-1]
 756                             return s_dec
 757                         key = k(url_data['s'][0])
 758                         url += '&signature=' + key
 759                     if 'ratebypass' not in url:
 760                         url += '&ratebypass=yes'
 761                     url_map[url_data['itag'][0]] = url
 762
 763             format_limit = self._downloader.params.get('format_limit', None)
 764             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 765             if format_limit is not None and format_limit in available_formats:
 766                 format_list = available_formats[available_formats.index(format_limit):]
 767             else:
 768                 format_list = available_formats
 769             existing_formats = [x for x in format_list if x in url_map]
 770             if len(existing_formats) == 0:
 771                 raise ExtractorError(u'no known formats available for video')
 772             if self._downloader.params.get('listformats', None):
 773                 self._print_formats(existing_formats)
 774                 return
 775             if req_format is None or req_format == 'best':
 776                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 777             elif req_format == 'worst':
 778                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 779             elif req_format in ('-1', 'all'):
 780                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 781             else:
 782                 # Specific formats. We pick the first in a slash-delimeted sequence.
 783                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 784                 req_formats = req_format.split('/')
 785                 video_url_list = None
 786                 for rf in req_formats:
 787                     if rf in url_map:
 788                         video_url_list = [(rf, url_map[rf])]
 789                         break
 790                 if video_url_list is None:
 791                     raise ExtractorError(u'requested format not available')
 792         else:
 793             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
 794
 795         results = []
 796         for format_param, video_real_url in video_url_list:
 797             # Extension
 798             video_extension = self._video_extensions.get(format_param, 'flv')
 799
 800             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 801                                               self._video_dimensions.get(format_param, '???'))
 802
 803             results.append({
 804                 'id':       video_id,
 805                 'url':      video_real_url,
 806                 'uploader': video_uploader,
 807                 'uploader_id': video_uploader_id,
 808                 'upload_date':  upload_date,
 809                 'title':    video_title,
 810                 'ext':      video_extension,
 811                 'format':   video_format,
 812                 'thumbnail':    video_thumbnail,
 813                 'description':  video_description,
 814                 'player_url':   player_url,
 815                 'subtitles':    video_subtitles,
 816                 'duration':     video_duration
 817             })
 818         return results
 819
 820
 821 class MetacafeIE(InfoExtractor):
 822     """Information Extractor for metacafe.com."""
 823
 824     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 825     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 826     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 827     IE_NAME = u'metacafe'
 828
 829     def report_disclaimer(self):
 830         """Report disclaimer retrieval."""
 831         self.to_screen(u'Retrieving disclaimer')
 832
 833     def _real_initialize(self):
 834         # Retrieve disclaimer
 835         request = compat_urllib_request.Request(self._DISCLAIMER)
 836         try:
 837             self.report_disclaimer()
 838             disclaimer = compat_urllib_request.urlopen(request).read()
 839         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 840             raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
 841
 842         # Confirm age
 843         disclaimer_form = {
 844             'filters': '0',
 845             'submit': "Continue - I'm over 18",
 846             }
 847         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 848         try:
 849             self.report_age_confirmation()
 850             disclaimer = compat_urllib_request.urlopen(request).read()
 851         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 852             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 853
 854     def _real_extract(self, url):
 855         # Extract id and simplified title from URL
 856         mobj = re.match(self._VALID_URL, url)
 857         if mobj is None:
 858             raise ExtractorError(u'Invalid URL: %s' % url)
 859
 860         video_id = mobj.group(1)
 861
 862         # Check if video comes from YouTube
 863         mobj2 = re.match(r'^yt-(.*)$', video_id)
 864         if mobj2 is not None:
 865             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
 866
 867         # Retrieve video webpage to extract further information
 868         webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
 869
 870         # Extract URL, uploader and title from webpage
 871         self.report_extraction(video_id)
 872         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 873         if mobj is not None:
 874             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 875             video_extension = mediaURL[-3:]
 876
 877             # Extract gdaKey if available
 878             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 879             if mobj is None:
 880                 video_url = mediaURL
 881             else:
 882                 gdaKey = mobj.group(1)
 883                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 884         else:
 885             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 886             if mobj is None:
 887                 raise ExtractorError(u'Unable to extract media URL')
 888             vardict = compat_parse_qs(mobj.group(1))
 889             if 'mediaData' not in vardict:
 890                 raise ExtractorError(u'Unable to extract media URL')
 891             mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
 892             if mobj is None:
 893                 raise ExtractorError(u'Unable to extract media URL')
 894             mediaURL = mobj.group('mediaURL').replace('\\/', '/')
 895             video_extension = mediaURL[-3:]
 896             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
 897
 898         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 899         if mobj is None:
 900             raise ExtractorError(u'Unable to extract title')
 901         video_title = mobj.group(1).decode('utf-8')
 902
 903         mobj = re.search(r'submitter=(.*?);', webpage)
 904         if mobj is None:
 905             raise ExtractorError(u'Unable to extract uploader nickname')
 906         video_uploader = mobj.group(1)
 907
 908         return [{
 909             'id':       video_id.decode('utf-8'),
 910             'url':      video_url.decode('utf-8'),
 911             'uploader': video_uploader.decode('utf-8'),
 912             'upload_date':  None,
 913             'title':    video_title,
 914             'ext':      video_extension.decode('utf-8'),
 915         }]
 916
 917 class DailymotionIE(InfoExtractor):
 918     """Information Extractor for Dailymotion"""
 919
 920     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 921     IE_NAME = u'dailymotion'
 922
 923     def _real_extract(self, url):
 924         # Extract id and simplified title from URL
 925         mobj = re.match(self._VALID_URL, url)
 926         if mobj is None:
 927             raise ExtractorError(u'Invalid URL: %s' % url)
 928
 929         video_id = mobj.group(1).split('_')[0].split('?')[0]
 930
 931         video_extension = 'mp4'
 932
 933         # Retrieve video webpage to extract further information
 934         request = compat_urllib_request.Request(url)
 935         request.add_header('Cookie', 'family_filter=off')
 936         webpage = self._download_webpage(request, video_id)
 937
 938         # Extract URL, uploader and title from webpage
 939         self.report_extraction(video_id)
 940         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 941         if mobj is None:
 942             raise ExtractorError(u'Unable to extract media URL')
 943         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 944
 945         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 946             if key in flashvars:
 947                 max_quality = key
 948                 self.to_screen(u'Using %s' % key)
 949                 break
 950         else:
 951             raise ExtractorError(u'Unable to extract video URL')
 952
 953         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 954         if mobj is None:
 955             raise ExtractorError(u'Unable to extract video URL')
 956
 957         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 958
 959         # TODO: support choosing qualities
 960
 961         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 962         if mobj is None:
 963             raise ExtractorError(u'Unable to extract title')
 964         video_title = unescapeHTML(mobj.group('title'))
 965
 966         video_uploader = None
 967         video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
 968                                              # Looking for official user
 969                                              r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
 970                                             webpage, 'video uploader')
 971
 972         video_upload_date = None
 973         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 974         if mobj is not None:
 975             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 976
 977         return [{
 978             'id':       video_id,
 979             'url':      video_url,
 980             'uploader': video_uploader,
 981             'upload_date':  video_upload_date,
 982             'title':    video_title,
 983             'ext':      video_extension,
 984         }]
 985
 986
 987 class PhotobucketIE(InfoExtractor):
 988     """Information extractor for photobucket.com."""
 989
 990     # TODO: the original _VALID_URL was:
 991     # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 992     # Check if it's necessary to keep the old extracion process
 993     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
 994     IE_NAME = u'photobucket'
 995
 996     def _real_extract(self, url):
 997         # Extract id from URL
 998         mobj = re.match(self._VALID_URL, url)
 999         if mobj is None:
1000             raise ExtractorError(u'Invalid URL: %s' % url)
1001
1002         video_id = mobj.group('id')
1003
1004         video_extension = mobj.group('ext')
1005
1006         # Retrieve video webpage to extract further information
1007         webpage = self._download_webpage(url, video_id)
1008
1009         # Extract URL, uploader, and title from webpage
1010         self.report_extraction(video_id)
1011         # We try first by looking the javascript code:
1012         mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
1013         if mobj is not None:
1014             info = json.loads(mobj.group('json'))
1015             return [{
1016                 'id':       video_id,
1017                 'url':      info[u'downloadUrl'],
1018                 'uploader': info[u'username'],
1019                 'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
1020                 'title':    info[u'title'],
1021                 'ext':      video_extension,
1022                 'thumbnail': info[u'thumbUrl'],
1023             }]
1024
1025         # We try looking in other parts of the webpage
1026         video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1027             webpage, u'video URL')
1028
1029         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1030         if mobj is None:
1031             raise ExtractorError(u'Unable to extract title')
1032         video_title = mobj.group(1).decode('utf-8')
1033         video_uploader = mobj.group(2).decode('utf-8')
1034
1035         return [{
1036             'id':       video_id.decode('utf-8'),
1037             'url':      video_url.decode('utf-8'),
1038             'uploader': video_uploader,
1039             'upload_date':  None,
1040             'title':    video_title,
1041             'ext':      video_extension.decode('utf-8'),
1042         }]
1043
1044
1045 class YahooIE(InfoExtractor):
1046     """Information extractor for screen.yahoo.com."""
1047     _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1048
1049     def _real_extract(self, url):
1050         mobj = re.match(self._VALID_URL, url)
1051         if mobj is None:
1052             raise ExtractorError(u'Invalid URL: %s' % url)
1053         video_id = mobj.group('id')
1054         webpage = self._download_webpage(url, video_id)
1055         m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1056
1057         if m_id is None:
1058             # TODO: Check which url parameters are required
1059             info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1060             webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
1061             info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1062                         <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1063                         <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1064                         <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1065                         '''
1066             self.report_extraction(video_id)
1067             m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1068             if m_info is None:
1069                 raise ExtractorError(u'Unable to extract video info')
1070             video_title = m_info.group('title')
1071             video_description = m_info.group('description')
1072             video_thumb = m_info.group('thumb')
1073             video_date = m_info.group('date')
1074             video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1075
1076             # TODO: Find a way to get mp4 videos
1077             rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1078             webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1079             m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1080             video_url = m_rest.group('url')
1081             video_path = m_rest.group('path')
1082             if m_rest is None:
1083                 raise ExtractorError(u'Unable to extract video url')
1084
1085         else: # We have to use a different method if another id is defined
1086             long_id = m_id.group('new_id')
1087             info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1088             webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
1089             json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1090             info = json.loads(json_str)
1091             res = info[u'query'][u'results'][u'mediaObj'][0]
1092             stream = res[u'streams'][0]
1093             video_path = stream[u'path']
1094             video_url = stream[u'host']
1095             meta = res[u'meta']
1096             video_title = meta[u'title']
1097             video_description = meta[u'description']
1098             video_thumb = meta[u'thumbnail']
1099             video_date = None # I can't find it
1100
1101         info_dict = {
1102                      'id': video_id,
1103                      'url': video_url,
1104                      'play_path': video_path,
1105                      'title':video_title,
1106                      'description': video_description,
1107                      'thumbnail': video_thumb,
1108                      'upload_date': video_date,
1109                      'ext': 'flv',
1110                      }
1111         return info_dict
1112
1113 class VimeoIE(InfoExtractor):
1114     """Information extractor for vimeo.com."""
1115
1116     # _VALID_URL matches Vimeo URLs
1117     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1118     IE_NAME = u'vimeo'
1119
1120     def _real_extract(self, url, new_video=True):
1121         # Extract ID from URL
1122         mobj = re.match(self._VALID_URL, url)
1123         if mobj is None:
1124             raise ExtractorError(u'Invalid URL: %s' % url)
1125
1126         video_id = mobj.group('id')
1127         if not mobj.group('proto'):
1128             url = 'https://' + url
1129         if mobj.group('direct_link') or mobj.group('pro'):
1130             url = 'https://vimeo.com/' + video_id
1131
1132         # Retrieve video webpage to extract further information
1133         request = compat_urllib_request.Request(url, None, std_headers)
1134         webpage = self._download_webpage(request, video_id)
1135
1136         # Now we begin extracting as much information as we can from what we
1137         # retrieved. First we extract the information common to all extractors,
1138         # and latter we extract those that are Vimeo specific.
1139         self.report_extraction(video_id)
1140
1141         # Extract the config JSON
1142         try:
1143             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1144             config = json.loads(config)
1145         except:
1146             if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1147                 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1148             else:
1149                 raise ExtractorError(u'Unable to extract info section')
1150
1151         # Extract title
1152         video_title = config["video"]["title"]
1153
1154         # Extract uploader and uploader_id
1155         video_uploader = config["video"]["owner"]["name"]
1156         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1157
1158         # Extract video thumbnail
1159         video_thumbnail = config["video"]["thumbnail"]
1160
1161         # Extract video description
1162         video_description = get_element_by_attribute("itemprop", "description", webpage)
1163         if video_description: video_description = clean_html(video_description)
1164         else: video_description = u''
1165
1166         # Extract upload date
1167         video_upload_date = None
1168         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1169         if mobj is not None:
1170             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1171
1172         # Vimeo specific: extract request signature and timestamp
1173         sig = config['request']['signature']
1174         timestamp = config['request']['timestamp']
1175
1176         # Vimeo specific: extract video codec and quality information
1177         # First consider quality, then codecs, then take everything
1178         # TODO bind to format param
1179         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1180         files = { 'hd': [], 'sd': [], 'other': []}
1181         for codec_name, codec_extension in codecs:
1182             if codec_name in config["video"]["files"]:
1183                 if 'hd' in config["video"]["files"][codec_name]:
1184                     files['hd'].append((codec_name, codec_extension, 'hd'))
1185                 elif 'sd' in config["video"]["files"][codec_name]:
1186                     files['sd'].append((codec_name, codec_extension, 'sd'))
1187                 else:
1188                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1189
1190         for quality in ('hd', 'sd', 'other'):
1191             if len(files[quality]) > 0:
1192                 video_quality = files[quality][0][2]
1193                 video_codec = files[quality][0][0]
1194                 video_extension = files[quality][0][1]
1195                 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1196                 break
1197         else:
1198             raise ExtractorError(u'No known codec found')
1199
1200         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1201                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1202
1203         return [{
1204             'id':       video_id,
1205             'url':      video_url,
1206             'uploader': video_uploader,
1207             'uploader_id': video_uploader_id,
1208             'upload_date':  video_upload_date,
1209             'title':    video_title,
1210             'ext':      video_extension,
1211             'thumbnail':    video_thumbnail,
1212             'description':  video_description,
1213         }]
1214
1215
1216 class ArteTvIE(InfoExtractor):
1217     """arte.tv information extractor."""
1218
1219     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1220     _LIVE_URL = r'index-[0-9]+\.html$'
1221
1222     IE_NAME = u'arte.tv'
1223
1224     def fetch_webpage(self, url):
1225         request = compat_urllib_request.Request(url)
1226         try:
1227             self.report_download_webpage(url)
1228             webpage = compat_urllib_request.urlopen(request).read()
1229         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1230             raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1231         except ValueError as err:
1232             raise ExtractorError(u'Invalid URL: %s' % url)
1233         return webpage
1234
1235     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1236         page = self.fetch_webpage(url)
1237         mobj = re.search(regex, page, regexFlags)
1238         info = {}
1239
1240         if mobj is None:
1241             raise ExtractorError(u'Invalid URL: %s' % url)
1242
1243         for (i, key, err) in matchTuples:
1244             if mobj.group(i) is None:
1245                 raise ExtractorError(err)
1246             else:
1247                 info[key] = mobj.group(i)
1248
1249         return info
1250
1251     def extractLiveStream(self, url):
1252         video_lang = url.split('/')[-4]
1253         info = self.grep_webpage(
1254             url,
1255             r'src="(.*?/videothek_js.*?\.js)',
1256             0,
1257             [
1258                 (1, 'url', u'Invalid URL: %s' % url)
1259             ]
1260         )
1261         http_host = url.split('/')[2]
1262         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1263         info = self.grep_webpage(
1264             next_url,
1265             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1266                 '(http://.*?\.swf).*?' +
1267                 '(rtmp://.*?)\'',
1268             re.DOTALL,
1269             [
1270                 (1, 'path',   u'could not extract video path: %s' % url),
1271                 (2, 'player', u'could not extract video player: %s' % url),
1272                 (3, 'url',    u'could not extract video url: %s' % url)
1273             ]
1274         )
1275         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1276
1277     def extractPlus7Stream(self, url):
1278         video_lang = url.split('/')[-3]
1279         info = self.grep_webpage(
1280             url,
1281             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1282             0,
1283             [
1284                 (1, 'url', u'Invalid URL: %s' % url)
1285             ]
1286         )
1287         next_url = compat_urllib_parse.unquote(info.get('url'))
1288         info = self.grep_webpage(
1289             next_url,
1290             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1291             0,
1292             [
1293                 (1, 'url', u'Could not find <video> tag: %s' % url)
1294             ]
1295         )
1296         next_url = compat_urllib_parse.unquote(info.get('url'))
1297
1298         info = self.grep_webpage(
1299             next_url,
1300             r'<video id="(.*?)".*?>.*?' +
1301                 '<name>(.*?)</name>.*?' +
1302                 '<dateVideo>(.*?)</dateVideo>.*?' +
1303                 '<url quality="hd">(.*?)</url>',
1304             re.DOTALL,
1305             [
1306                 (1, 'id',    u'could not extract video id: %s' % url),
1307                 (2, 'title', u'could not extract video title: %s' % url),
1308                 (3, 'date',  u'could not extract video date: %s' % url),
1309                 (4, 'url',   u'could not extract video url: %s' % url)
1310             ]
1311         )
1312
1313         return {
1314             'id':           info.get('id'),
1315             'url':          compat_urllib_parse.unquote(info.get('url')),
1316             'uploader':     u'arte.tv',
1317             'upload_date':  unified_strdate(info.get('date')),
1318             'title':        info.get('title').decode('utf-8'),
1319             'ext':          u'mp4',
1320             'format':       u'NA',
1321             'player_url':   None,
1322         }
1323
1324     def _real_extract(self, url):
1325         video_id = url.split('/')[-1]
1326         self.report_extraction(video_id)
1327
1328         if re.search(self._LIVE_URL, video_id) is not None:
1329             self.extractLiveStream(url)
1330             return
1331         else:
1332             info = self.extractPlus7Stream(url)
1333
1334         return [info]
1335
1336
1337 class GenericIE(InfoExtractor):
1338     """Generic last-resort information extractor."""
1339
1340     _VALID_URL = r'.*'
1341     IE_NAME = u'generic'
1342
1343     def report_download_webpage(self, video_id):
1344         """Report webpage download."""
1345         if not self._downloader.params.get('test', False):
1346             self._downloader.report_warning(u'Falling back on generic information extractor.')
1347         super(GenericIE, self).report_download_webpage(video_id)
1348
1349     def report_following_redirect(self, new_url):
1350         """Report information extraction."""
1351         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1352
1353     def _test_redirect(self, url):
1354         """Check if it is a redirect, like url shorteners, in case return the new url."""
1355         class HeadRequest(compat_urllib_request.Request):
1356             def get_method(self):
1357                 return "HEAD"
1358
1359         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1360             """
1361             Subclass the HTTPRedirectHandler to make it use our
1362             HeadRequest also on the redirected URL
1363             """
1364             def redirect_request(self, req, fp, code, msg, headers, newurl):
1365                 if code in (301, 302, 303, 307):
1366                     newurl = newurl.replace(' ', '%20')
1367                     newheaders = dict((k,v) for k,v in req.headers.items()
1368                                       if k.lower() not in ("content-length", "content-type"))
1369                     return HeadRequest(newurl,
1370                                        headers=newheaders,
1371                                        origin_req_host=req.get_origin_req_host(),
1372                                        unverifiable=True)
1373                 else:
1374                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1375
1376         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1377             """
1378             Fallback to GET if HEAD is not allowed (405 HTTP error)
1379             """
1380             def http_error_405(self, req, fp, code, msg, headers):
1381                 fp.read()
1382                 fp.close()
1383
1384                 newheaders = dict((k,v) for k,v in req.headers.items()
1385                                   if k.lower() not in ("content-length", "content-type"))
1386                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1387                                                  headers=newheaders,
1388                                                  origin_req_host=req.get_origin_req_host(),
1389                                                  unverifiable=True))
1390
1391         # Build our opener
1392         opener = compat_urllib_request.OpenerDirector()
1393         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1394                         HTTPMethodFallback, HEADRedirectHandler,
1395                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1396             opener.add_handler(handler())
1397
1398         response = opener.open(HeadRequest(url))
1399         if response is None:
1400             raise ExtractorError(u'Invalid URL protocol')
1401         new_url = response.geturl()
1402
1403         if url == new_url:
1404             return False
1405
1406         self.report_following_redirect(new_url)
1407         return new_url
1408
1409     def _real_extract(self, url):
1410         new_url = self._test_redirect(url)
1411         if new_url: return [self.url_result(new_url)]
1412
1413         video_id = url.split('/')[-1]
1414         try:
1415             webpage = self._download_webpage(url, video_id)
1416         except ValueError as err:
1417             # since this is the last-resort InfoExtractor, if
1418             # this error is thrown, it'll be thrown here
1419             raise ExtractorError(u'Invalid URL: %s' % url)
1420
1421         self.report_extraction(video_id)
1422         # Start with something easy: JW Player in SWFObject
1423         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1424         if mobj is None:
1425             # Broaden the search a little bit
1426             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1427         if mobj is None:
1428             # Broaden the search a little bit: JWPlayer JS loader
1429             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1430         if mobj is None:
1431             # Try to find twitter cards info
1432             mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1433         if mobj is None:
1434             raise ExtractorError(u'Invalid URL: %s' % url)
1435
1436         # It's possible that one of the regexes
1437         # matched, but returned an empty group:
1438         if mobj.group(1) is None:
1439             raise ExtractorError(u'Invalid URL: %s' % url)
1440
1441         video_url = compat_urllib_parse.unquote(mobj.group(1))
1442         video_id = os.path.basename(video_url)
1443
1444         # here's a fun little line of code for you:
1445         video_extension = os.path.splitext(video_id)[1][1:]
1446         video_id = os.path.splitext(video_id)[0]
1447
1448         # it's tempting to parse this further, but you would
1449         # have to take into account all the variations like
1450         #   Video Title - Site Name
1451         #   Site Name | Video Title
1452         #   Video Title - Tagline | Site Name
1453         # and so on and so forth; it's just not practical
1454         video_title = self._html_search_regex(r'<title>(.*)</title>',
1455             webpage, u'video title')
1456
1457         # video uploader is domain name
1458         video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1459             url, u'video uploader')
1460
1461         return [{
1462             'id':       video_id,
1463             'url':      video_url,
1464             'uploader': video_uploader,
1465             'upload_date':  None,
1466             'title':    video_title,
1467             'ext':      video_extension,
1468         }]
1469
1470
1471 class YoutubeSearchIE(SearchInfoExtractor):
1472     """Information Extractor for YouTube search queries."""
1473     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1474     _MAX_RESULTS = 1000
1475     IE_NAME = u'youtube:search'
1476     _SEARCH_KEY = 'ytsearch'
1477
1478     def report_download_page(self, query, pagenum):
1479         """Report attempt to download search page with given number."""
1480         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1481
1482     def _get_n_results(self, query, n):
1483         """Get a specified number of results for a query"""
1484
1485         video_ids = []
1486         pagenum = 0
1487         limit = n
1488
1489         while (50 * pagenum) < limit:
1490             self.report_download_page(query, pagenum+1)
1491             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1492             request = compat_urllib_request.Request(result_url)
1493             try:
1494                 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1495             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1496                 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1497             api_response = json.loads(data)['data']
1498
1499             if not 'items' in api_response:
1500                 raise ExtractorError(u'[youtube] No video results')
1501
1502             new_ids = list(video['id'] for video in api_response['items'])
1503             video_ids += new_ids
1504
1505             limit = min(n, api_response['totalItems'])
1506             pagenum += 1
1507
1508         if len(video_ids) > n:
1509             video_ids = video_ids[:n]
1510         videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1511         return self.playlist_result(videos, query)
1512
1513
1514 class GoogleSearchIE(SearchInfoExtractor):
1515     """Information Extractor for Google Video search queries."""
1516     _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1517     _MAX_RESULTS = 1000
1518     IE_NAME = u'video.google:search'
1519     _SEARCH_KEY = 'gvsearch'
1520
1521     def _get_n_results(self, query, n):
1522         """Get a specified number of results for a query"""
1523
1524         res = {
1525             '_type': 'playlist',
1526             'id': query,
1527             'entries': []
1528         }
1529
1530         for pagenum in itertools.count(1):
1531             result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1532             webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1533                                              note='Downloading result page ' + str(pagenum))
1534
1535             for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1536                 e = {
1537                     '_type': 'url',
1538                     'url': mobj.group(1)
1539                 }
1540                 res['entries'].append(e)
1541
1542             if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1543                 return res
1544
1545 class YahooSearchIE(SearchInfoExtractor):
1546     """Information Extractor for Yahoo! Video search queries."""
1547
1548     _MAX_RESULTS = 1000
1549     IE_NAME = u'screen.yahoo:search'
1550     _SEARCH_KEY = 'yvsearch'
1551
1552     def _get_n_results(self, query, n):
1553         """Get a specified number of results for a query"""
1554
1555         res = {
1556             '_type': 'playlist',
1557             'id': query,
1558             'entries': []
1559         }
1560         for pagenum in itertools.count(0):
1561             result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1562             webpage = self._download_webpage(result_url, query,
1563                                              note='Downloading results page '+str(pagenum+1))
1564             info = json.loads(webpage)
1565             m = info[u'm']
1566             results = info[u'results']
1567
1568             for (i, r) in enumerate(results):
1569                 if (pagenum * 30) +i >= n:
1570                     break
1571                 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1572                 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1573                 res['entries'].append(e)
1574             if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1575                 break
1576
1577         return res
1578
1579
1580 class YoutubePlaylistIE(InfoExtractor):
1581     """Information Extractor for YouTube playlists."""
1582
1583     _VALID_URL = r"""(?:
1584                         (?:https?://)?
1585                         (?:\w+\.)?
1586                         youtube\.com/
1587                         (?:
1588                            (?:course|view_play_list|my_playlists|artist|playlist|watch)
1589                            \? (?:.*?&)*? (?:p|a|list)=
1590                         |  p/
1591                         )
1592                         ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1593                         .*
1594                      |
1595                         ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1596                      )"""
1597     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1598     _MAX_RESULTS = 50
1599     IE_NAME = u'youtube:playlist'
1600
1601     @classmethod
1602     def suitable(cls, url):
1603         """Receives a URL and returns True if suitable for this IE."""
1604         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1605
1606     def _real_extract(self, url):
1607         # Extract playlist id
1608         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1609         if mobj is None:
1610             raise ExtractorError(u'Invalid URL: %s' % url)
1611
1612         # Download playlist videos from API
1613         playlist_id = mobj.group(1) or mobj.group(2)
1614         page_num = 1
1615         videos = []
1616
1617         while True:
1618             url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1619             page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1620
1621             try:
1622                 response = json.loads(page)
1623             except ValueError as err:
1624                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1625
1626             if 'feed' not in response:
1627                 raise ExtractorError(u'Got a malformed response from YouTube API')
1628             playlist_title = response['feed']['title']['$t']
1629             if 'entry' not in response['feed']:
1630                 # Number of videos is a multiple of self._MAX_RESULTS
1631                 break
1632
1633             videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1634                         for entry in response['feed']['entry']
1635                         if 'content' in entry ]
1636
1637             if len(response['feed']['entry']) < self._MAX_RESULTS:
1638                 break
1639             page_num += 1
1640
1641         videos = [v[1] for v in sorted(videos)]
1642
1643         url_results = [self.url_result(url, 'Youtube') for url in videos]
1644         return [self.playlist_result(url_results, playlist_id, playlist_title)]
1645
1646
1647 class YoutubeChannelIE(InfoExtractor):
1648     """Information Extractor for YouTube channels."""
1649
1650     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1651     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1652     _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1653     _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1654     IE_NAME = u'youtube:channel'
1655
1656     def extract_videos_from_page(self, page):
1657         ids_in_page = []
1658         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1659             if mobj.group(1) not in ids_in_page:
1660                 ids_in_page.append(mobj.group(1))
1661         return ids_in_page
1662
1663     def _real_extract(self, url):
1664         # Extract channel id
1665         mobj = re.match(self._VALID_URL, url)
1666         if mobj is None:
1667             raise ExtractorError(u'Invalid URL: %s' % url)
1668
1669         # Download channel page
1670         channel_id = mobj.group(1)
1671         video_ids = []
1672         pagenum = 1
1673
1674         url = self._TEMPLATE_URL % (channel_id, pagenum)
1675         page = self._download_webpage(url, channel_id,
1676                                       u'Downloading page #%s' % pagenum)
1677
1678         # Extract video identifiers
1679         ids_in_page = self.extract_videos_from_page(page)
1680         video_ids.extend(ids_in_page)
1681
1682         # Download any subsequent channel pages using the json-based channel_ajax query
1683         if self._MORE_PAGES_INDICATOR in page:
1684             while True:
1685                 pagenum = pagenum + 1
1686
1687                 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1688                 page = self._download_webpage(url, channel_id,
1689                                               u'Downloading page #%s' % pagenum)
1690
1691                 page = json.loads(page)
1692
1693                 ids_in_page = self.extract_videos_from_page(page['content_html'])
1694                 video_ids.extend(ids_in_page)
1695
1696                 if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']:
1697                     break
1698
1699         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1700
1701         urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1702         url_entries = [self.url_result(url, 'Youtube') for url in urls]
1703         return [self.playlist_result(url_entries, channel_id)]
1704
1705
1706 class YoutubeUserIE(InfoExtractor):
1707     """Information Extractor for YouTube users."""
1708
1709     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1710     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1711     _GDATA_PAGE_SIZE = 50
1712     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1713     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1714     IE_NAME = u'youtube:user'
1715
1716     def _real_extract(self, url):
1717         # Extract username
1718         mobj = re.match(self._VALID_URL, url)
1719         if mobj is None:
1720             raise ExtractorError(u'Invalid URL: %s' % url)
1721
1722         username = mobj.group(1)
1723
1724         # Download video ids using YouTube Data API. Result size per
1725         # query is limited (currently to 50 videos) so we need to query
1726         # page by page until there are no video ids - it means we got
1727         # all of them.
1728
1729         video_ids = []
1730         pagenum = 0
1731
1732         while True:
1733             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1734
1735             gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1736             page = self._download_webpage(gdata_url, username,
1737                                           u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1738
1739             # Extract video identifiers
1740             ids_in_page = []
1741
1742             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1743                 if mobj.group(1) not in ids_in_page:
1744                     ids_in_page.append(mobj.group(1))
1745
1746             video_ids.extend(ids_in_page)
1747
1748             # A little optimization - if current page is not
1749             # "full", ie. does not contain PAGE_SIZE video ids then
1750             # we can assume that this page is the last one - there
1751             # are no more ids on further pages - no need to query
1752             # again.
1753
1754             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1755                 break
1756
1757             pagenum += 1
1758
1759         urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1760         url_results = [self.url_result(url, 'Youtube') for url in urls]
1761         return [self.playlist_result(url_results, playlist_title = username)]
1762
1763
1764 class BlipTVUserIE(InfoExtractor):
1765     """Information Extractor for blip.tv users."""
1766
1767     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1768     _PAGE_SIZE = 12
1769     IE_NAME = u'blip.tv:user'
1770
1771     def _real_extract(self, url):
1772         # Extract username
1773         mobj = re.match(self._VALID_URL, url)
1774         if mobj is None:
1775             raise ExtractorError(u'Invalid URL: %s' % url)
1776
1777         username = mobj.group(1)
1778
1779         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1780
1781         page = self._download_webpage(url, username, u'Downloading user page')
1782         mobj = re.search(r'data-users-id="([^"]+)"', page)
1783         page_base = page_base % mobj.group(1)
1784
1785
1786         # Download video ids using BlipTV Ajax calls. Result size per
1787         # query is limited (currently to 12 videos) so we need to query
1788         # page by page until there are no video ids - it means we got
1789         # all of them.
1790
1791         video_ids = []
1792         pagenum = 1
1793
1794         while True:
1795             url = page_base + "&page=" + str(pagenum)
1796             page = self._download_webpage(url, username,
1797                                           u'Downloading video ids from page %d' % pagenum)
1798
1799             # Extract video identifiers
1800             ids_in_page = []
1801
1802             for mobj in re.finditer(r'href="/([^"]+)"', page):
1803                 if mobj.group(1) not in ids_in_page:
1804                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1805
1806             video_ids.extend(ids_in_page)
1807
1808             # A little optimization - if current page is not
1809             # "full", ie. does not contain PAGE_SIZE video ids then
1810             # we can assume that this page is the last one - there
1811             # are no more ids on further pages - no need to query
1812             # again.
1813
1814             if len(ids_in_page) < self._PAGE_SIZE:
1815                 break
1816
1817             pagenum += 1
1818
1819         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1820         url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1821         return [self.playlist_result(url_entries, playlist_title = username)]
1822
1823
1824 class DepositFilesIE(InfoExtractor):
1825     """Information extractor for depositfiles.com"""
1826
1827     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1828
1829     def _real_extract(self, url):
1830         file_id = url.split('/')[-1]
1831         # Rebuild url in english locale
1832         url = 'http://depositfiles.com/en/files/' + file_id
1833
1834         # Retrieve file webpage with 'Free download' button pressed
1835         free_download_indication = { 'gateway_result' : '1' }
1836         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1837         try:
1838             self.report_download_webpage(file_id)
1839             webpage = compat_urllib_request.urlopen(request).read()
1840         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1841             raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1842
1843         # Search for the real file URL
1844         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1845         if (mobj is None) or (mobj.group(1) is None):
1846             # Try to figure out reason of the error.
1847             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1848             if (mobj is not None) and (mobj.group(1) is not None):
1849                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1850                 raise ExtractorError(u'%s' % restriction_message)
1851             else:
1852                 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1853
1854         file_url = mobj.group(1)
1855         file_extension = os.path.splitext(file_url)[1][1:]
1856
1857         # Search for file title
1858         file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
1859
1860         return [{
1861             'id':       file_id.decode('utf-8'),
1862             'url':      file_url.decode('utf-8'),
1863             'uploader': None,
1864             'upload_date':  None,
1865             'title':    file_title,
1866             'ext':      file_extension.decode('utf-8'),
1867         }]
1868
1869
1870 class FacebookIE(InfoExtractor):
1871     """Information Extractor for Facebook"""
1872
1873     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1874     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1875     _NETRC_MACHINE = 'facebook'
1876     IE_NAME = u'facebook'
1877
1878     def report_login(self):
1879         """Report attempt to log in."""
1880         self.to_screen(u'Logging in')
1881
1882     def _real_initialize(self):
1883         if self._downloader is None:
1884             return
1885
1886         useremail = None
1887         password = None
1888         downloader_params = self._downloader.params
1889
1890         # Attempt to use provided username and password or .netrc data
1891         if downloader_params.get('username', None) is not None:
1892             useremail = downloader_params['username']
1893             password = downloader_params['password']
1894         elif downloader_params.get('usenetrc', False):
1895             try:
1896                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1897                 if info is not None:
1898                     useremail = info[0]
1899                     password = info[2]
1900                 else:
1901                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1902             except (IOError, netrc.NetrcParseError) as err:
1903                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1904                 return
1905
1906         if useremail is None:
1907             return
1908
1909         # Log in
1910         login_form = {
1911             'email': useremail,
1912             'pass': password,
1913             'login': 'Log+In'
1914             }
1915         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1916         try:
1917             self.report_login()
1918             login_results = compat_urllib_request.urlopen(request).read()
1919             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1920                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1921                 return
1922         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1923             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1924             return
1925
1926     def _real_extract(self, url):
1927         mobj = re.match(self._VALID_URL, url)
1928         if mobj is None:
1929             raise ExtractorError(u'Invalid URL: %s' % url)
1930         video_id = mobj.group('ID')
1931
1932         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1933         webpage = self._download_webpage(url, video_id)
1934
1935         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1936         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1937         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1938         if not m:
1939             raise ExtractorError(u'Cannot parse data')
1940         data = dict(json.loads(m.group(1)))
1941         params_raw = compat_urllib_parse.unquote(data['params'])
1942         params = json.loads(params_raw)
1943         video_data = params['video_data'][0]
1944         video_url = video_data.get('hd_src')
1945         if not video_url:
1946             video_url = video_data['sd_src']
1947         if not video_url:
1948             raise ExtractorError(u'Cannot find video URL')
1949         video_duration = int(video_data['video_duration'])
1950         thumbnail = video_data['thumbnail_src']
1951
1952         video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1953             webpage, u'title')
1954
1955         info = {
1956             'id': video_id,
1957             'title': video_title,
1958             'url': video_url,
1959             'ext': 'mp4',
1960             'duration': video_duration,
1961             'thumbnail': thumbnail,
1962         }
1963         return [info]
1964
1965
1966 class BlipTVIE(InfoExtractor):
1967     """Information extractor for blip.tv"""
1968
1969     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
1970     _URL_EXT = r'^.*\.([a-z0-9]+)$'
1971     IE_NAME = u'blip.tv'
1972
1973     def report_direct_download(self, title):
1974         """Report information extraction."""
1975         self.to_screen(u'%s: Direct download detected' % title)
1976
1977     def _real_extract(self, url):
1978         mobj = re.match(self._VALID_URL, url)
1979         if mobj is None:
1980             raise ExtractorError(u'Invalid URL: %s' % url)
1981
1982         # See https://github.com/rg3/youtube-dl/issues/857
1983         api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1984         if api_mobj is not None:
1985             url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1986         urlp = compat_urllib_parse_urlparse(url)
1987         if urlp.path.startswith('/play/'):
1988             request = compat_urllib_request.Request(url)
1989             response = compat_urllib_request.urlopen(request)
1990             redirecturl = response.geturl()
1991             rurlp = compat_urllib_parse_urlparse(redirecturl)
1992             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1993             url = 'http://blip.tv/a/a-' + file_id
1994             return self._real_extract(url)
1995
1996
1997         if '?' in url:
1998             cchar = '&'
1999         else:
2000             cchar = '?'
2001         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2002         request = compat_urllib_request.Request(json_url)
2003         request.add_header('User-Agent', 'iTunes/10.6.1')
2004         self.report_extraction(mobj.group(1))
2005         info = None
2006         try:
2007             urlh = compat_urllib_request.urlopen(request)
2008             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2009                 basename = url.split('/')[-1]
2010                 title,ext = os.path.splitext(basename)
2011                 title = title.decode('UTF-8')
2012                 ext = ext.replace('.', '')
2013                 self.report_direct_download(title)
2014                 info = {
2015                     'id': title,
2016                     'url': url,
2017                     'uploader': None,
2018                     'upload_date': None,
2019                     'title': title,
2020                     'ext': ext,
2021                     'urlhandle': urlh
2022                 }
2023         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2024             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2025         if info is None: # Regular URL
2026             try:
2027                 json_code_bytes = urlh.read()
2028                 json_code = json_code_bytes.decode('utf-8')
2029             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2030                 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2031
2032             try:
2033                 json_data = json.loads(json_code)
2034                 if 'Post' in json_data:
2035                     data = json_data['Post']
2036                 else:
2037                     data = json_data
2038
2039                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2040                 video_url = data['media']['url']
2041                 umobj = re.match(self._URL_EXT, video_url)
2042                 if umobj is None:
2043                     raise ValueError('Can not determine filename extension')
2044                 ext = umobj.group(1)
2045
2046                 info = {
2047                     'id': data['item_id'],
2048                     'url': video_url,
2049                     'uploader': data['display_name'],
2050                     'upload_date': upload_date,
2051                     'title': data['title'],
2052                     'ext': ext,
2053                     'format': data['media']['mimeType'],
2054                     'thumbnail': data['thumbnailUrl'],
2055                     'description': data['description'],
2056                     'player_url': data['embedUrl'],
2057                     'user_agent': 'iTunes/10.6.1',
2058                 }
2059             except (ValueError,KeyError) as err:
2060                 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
2061
2062         return [info]
2063
2064
2065 class MyVideoIE(InfoExtractor):
2066     """Information Extractor for myvideo.de."""
2067
2068     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2069     IE_NAME = u'myvideo'
2070
2071     # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2072     # Released into the Public Domain by Tristan Fischer on 2013-05-19
2073     # https://github.com/rg3/youtube-dl/pull/842
2074     def __rc4crypt(self,data, key):
2075         x = 0
2076         box = list(range(256))
2077         for i in list(range(256)):
2078             x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2079             box[i], box[x] = box[x], box[i]
2080         x = 0
2081         y = 0
2082         out = ''
2083         for char in data:
2084             x = (x + 1) % 256
2085             y = (y + box[x]) % 256
2086             box[x], box[y] = box[y], box[x]
2087             out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
2088         return out
2089
2090     def __md5(self,s):
2091         return hashlib.md5(s).hexdigest().encode()
2092
2093     def _real_extract(self,url):
2094         mobj = re.match(self._VALID_URL, url)
2095         if mobj is None:
2096             raise ExtractorError(u'invalid URL: %s' % url)
2097
2098         video_id = mobj.group(1)
2099
2100         GK = (
2101           b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2102           b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2103           b'TnpsbA0KTVRkbU1tSTRNdz09'
2104         )
2105
2106         # Get video webpage
2107         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2108         webpage = self._download_webpage(webpage_url, video_id)
2109
2110         mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2111         if mobj is not None:
2112             self.report_extraction(video_id)
2113             video_url = mobj.group(1) + '.flv'
2114
2115             video_title = self._html_search_regex('<title>([^<]+)</title>',
2116                 webpage, u'title')
2117
2118             video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2119
2120             return [{
2121                 'id':       video_id,
2122                 'url':      video_url,
2123                 'uploader': None,
2124                 'upload_date':  None,
2125                 'title':    video_title,
2126                 'ext':      u'flv',
2127             }]
2128
2129         # try encxml
2130         mobj = re.search('var flashvars={(.+?)}', webpage)
2131         if mobj is None:
2132             raise ExtractorError(u'Unable to extract video')
2133
2134         params = {}
2135         encxml = ''
2136         sec = mobj.group(1)
2137         for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2138             if not a == '_encxml':
2139                 params[a] = b
2140             else:
2141                 encxml = compat_urllib_parse.unquote(b)
2142         if not params.get('domain'):
2143             params['domain'] = 'www.myvideo.de'
2144         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2145         if 'flash_playertype=MTV' in xmldata_url:
2146             self._downloader.report_warning(u'avoiding MTV player')
2147             xmldata_url = (
2148                 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2149                 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
2150             ) % video_id
2151
2152         # get enc data
2153         enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2154         enc_data_b = binascii.unhexlify(enc_data)
2155         sk = self.__md5(
2156             base64.b64decode(base64.b64decode(GK)) +
2157             self.__md5(
2158                 str(video_id).encode('utf-8')
2159             )
2160         )
2161         dec_data = self.__rc4crypt(enc_data_b, sk)
2162
2163         # extracting infos
2164         self.report_extraction(video_id)
2165
2166         video_url = None
2167         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2168         if mobj:
2169             video_url = compat_urllib_parse.unquote(mobj.group(1))
2170             if 'myvideo2flash' in video_url:
2171                 self._downloader.report_warning(u'forcing RTMPT ...')
2172                 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2173
2174         if not video_url:
2175             # extract non rtmp videos
2176             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2177             if mobj is None:
2178                 raise ExtractorError(u'unable to extract url')
2179             video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2180
2181         video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2182         video_file = compat_urllib_parse.unquote(video_file)
2183
2184         if not video_file.endswith('f4m'):
2185             ppath, prefix = video_file.split('.')
2186             video_playpath = '%s:%s' % (prefix, ppath)
2187             video_hls_playlist = ''
2188         else:
2189             video_playpath = ''
2190             video_hls_playlist = (
2191                 video_filepath + video_file
2192             ).replace('.f4m', '.m3u8')
2193
2194         video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2195         video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2196
2197         video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2198             webpage, u'title')
2199
2200         return [{
2201             'id':                 video_id,
2202             'url':                video_url,
2203             'tc_url':             video_url,
2204             'uploader':           None,
2205             'upload_date':        None,
2206             'title':              video_title,
2207             'ext':                u'flv',
2208             'play_path':          video_playpath,
2209             'video_file':         video_file,
2210             'video_hls_playlist': video_hls_playlist,
2211             'player_url':         video_swfobj,
2212         }]
2213
2214
2215 class ComedyCentralIE(InfoExtractor):
2216     """Information extractor for The Daily Show and Colbert Report """
2217
2218     # urls can be abbreviations like :thedailyshow or :colbert
2219     # urls for episodes like:
2220     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2221     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2222     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2223     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2224                       |(https?://)?(www\.)?
2225                           (?P<showname>thedailyshow|colbertnation)\.com/
2226                          (full-episodes/(?P<episode>.*)|
2227                           (?P<clip>
2228                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2229                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2230                      $"""
2231
2232     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2233
2234     _video_extensions = {
2235         '3500': 'mp4',
2236         '2200': 'mp4',
2237         '1700': 'mp4',
2238         '1200': 'mp4',
2239         '750': 'mp4',
2240         '400': 'mp4',
2241     }
2242     _video_dimensions = {
2243         '3500': '1280x720',
2244         '2200': '960x540',
2245         '1700': '768x432',
2246         '1200': '640x360',
2247         '750': '512x288',
2248         '400': '384x216',
2249     }
2250
2251     @classmethod
2252     def suitable(cls, url):
2253         """Receives a URL and returns True if suitable for this IE."""
2254         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2255
2256     def _print_formats(self, formats):
2257         print('Available formats:')
2258         for x in formats:
2259             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2260
2261
2262     def _real_extract(self, url):
2263         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2264         if mobj is None:
2265             raise ExtractorError(u'Invalid URL: %s' % url)
2266
2267         if mobj.group('shortname'):
2268             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2269                 url = u'http://www.thedailyshow.com/full-episodes/'
2270             else:
2271                 url = u'http://www.colbertnation.com/full-episodes/'
2272             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2273             assert mobj is not None
2274
2275         if mobj.group('clip'):
2276             if mobj.group('showname') == 'thedailyshow':
2277                 epTitle = mobj.group('tdstitle')
2278             else:
2279                 epTitle = mobj.group('cntitle')
2280             dlNewest = False
2281         else:
2282             dlNewest = not mobj.group('episode')
2283             if dlNewest:
2284                 epTitle = mobj.group('showname')
2285             else:
2286                 epTitle = mobj.group('episode')
2287
2288         self.report_extraction(epTitle)
2289         webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2290         if dlNewest:
2291             url = htmlHandle.geturl()
2292             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2293             if mobj is None:
2294                 raise ExtractorError(u'Invalid redirected URL: ' + url)
2295             if mobj.group('episode') == '':
2296                 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2297             epTitle = mobj.group('episode')
2298
2299         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2300
2301         if len(mMovieParams) == 0:
2302             # The Colbert Report embeds the information in a without
2303             # a URL prefix; so extract the alternate reference
2304             # and then add the URL prefix manually.
2305
2306             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2307             if len(altMovieParams) == 0:
2308                 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2309             else:
2310                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2311
2312         uri = mMovieParams[0][1]
2313         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2314         indexXml = self._download_webpage(indexUrl, epTitle,
2315                                           u'Downloading show index',
2316                                           u'unable to download episode index')
2317
2318         results = []
2319
2320         idoc = xml.etree.ElementTree.fromstring(indexXml)
2321         itemEls = idoc.findall('.//item')
2322         for partNum,itemEl in enumerate(itemEls):
2323             mediaId = itemEl.findall('./guid')[0].text
2324             shortMediaId = mediaId.split(':')[-1]
2325             showId = mediaId.split(':')[-2].replace('.com', '')
2326             officialTitle = itemEl.findall('./title')[0].text
2327             officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2328
2329             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2330                         compat_urllib_parse.urlencode({'uri': mediaId}))
2331             configXml = self._download_webpage(configUrl, epTitle,
2332                                                u'Downloading configuration for %s' % shortMediaId)
2333
2334             cdoc = xml.etree.ElementTree.fromstring(configXml)
2335             turls = []
2336             for rendition in cdoc.findall('.//rendition'):
2337                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2338                 turls.append(finfo)
2339
2340             if len(turls) == 0:
2341                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2342                 continue
2343
2344             if self._downloader.params.get('listformats', None):
2345                 self._print_formats([i[0] for i in turls])
2346                 return
2347
2348             # For now, just pick the highest bitrate
2349             format,rtmp_video_url = turls[-1]
2350
2351             # Get the format arg from the arg stream
2352             req_format = self._downloader.params.get('format', None)
2353
2354             # Select format if we can find one
2355             for f,v in turls:
2356                 if f == req_format:
2357                     format, rtmp_video_url = f, v
2358                     break
2359
2360             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2361             if not m:
2362                 raise ExtractorError(u'Cannot transform RTMP url')
2363             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2364             video_url = base + m.group('finalid')
2365
2366             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2367             info = {
2368                 'id': shortMediaId,
2369                 'url': video_url,
2370                 'uploader': showId,
2371                 'upload_date': officialDate,
2372                 'title': effTitle,
2373                 'ext': 'mp4',
2374                 'format': format,
2375                 'thumbnail': None,
2376                 'description': officialTitle,
2377             }
2378             results.append(info)
2379
2380         return results
2381
2382
2383 class EscapistIE(InfoExtractor):
2384     """Information extractor for The Escapist """
2385
2386     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2387     IE_NAME = u'escapist'
2388
2389     def _real_extract(self, url):
2390         mobj = re.match(self._VALID_URL, url)
2391         if mobj is None:
2392             raise ExtractorError(u'Invalid URL: %s' % url)
2393         showName = mobj.group('showname')
2394         videoId = mobj.group('episode')
2395
2396         self.report_extraction(videoId)
2397         webpage = self._download_webpage(url, videoId)
2398
2399         videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2400             webpage, u'description', fatal=False)
2401
2402         imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2403             webpage, u'thumbnail', fatal=False)
2404
2405         playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2406             webpage, u'player url')
2407
2408         title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2409             webpage, u'player url').split(' : ')[-1]
2410
2411         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2412         configUrl = compat_urllib_parse.unquote(configUrl)
2413
2414         configJSON = self._download_webpage(configUrl, videoId,
2415                                             u'Downloading configuration',
2416                                             u'unable to download configuration')
2417
2418         # Technically, it's JavaScript, not JSON
2419         configJSON = configJSON.replace("'", '"')
2420
2421         try:
2422             config = json.loads(configJSON)
2423         except (ValueError,) as err:
2424             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2425
2426         playlist = config['playlist']
2427         videoUrl = playlist[1]['url']
2428
2429         info = {
2430             'id': videoId,
2431             'url': videoUrl,
2432             'uploader': showName,
2433             'upload_date': None,
2434             'title': title,
2435             'ext': 'mp4',
2436             'thumbnail': imgUrl,
2437             'description': videoDesc,
2438             'player_url': playerUrl,
2439         }
2440
2441         return [info]
2442
2443 class CollegeHumorIE(InfoExtractor):
2444     """Information extractor for collegehumor.com"""
2445
2446     _WORKING = False
2447     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2448     IE_NAME = u'collegehumor'
2449
2450     def report_manifest(self, video_id):
2451         """Report information extraction."""
2452         self.to_screen(u'%s: Downloading XML manifest' % video_id)
2453
2454     def _real_extract(self, url):
2455         mobj = re.match(self._VALID_URL, url)
2456         if mobj is None:
2457             raise ExtractorError(u'Invalid URL: %s' % url)
2458         video_id = mobj.group('videoid')
2459
2460         info = {
2461             'id': video_id,
2462             'uploader': None,
2463             'upload_date': None,
2464         }
2465
2466         self.report_extraction(video_id)
2467         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2468         try:
2469             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2470         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2471             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2472
2473         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2474         try:
2475             videoNode = mdoc.findall('./video')[0]
2476             info['description'] = videoNode.findall('./description')[0].text
2477             info['title'] = videoNode.findall('./caption')[0].text
2478             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2479             manifest_url = videoNode.findall('./file')[0].text
2480         except IndexError:
2481             raise ExtractorError(u'Invalid metadata XML file')
2482
2483         manifest_url += '?hdcore=2.10.3'
2484         self.report_manifest(video_id)
2485         try:
2486             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2487         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2488             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2489
2490         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2491         try:
2492             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2493             node_id = media_node.attrib['url']
2494             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2495         except IndexError as err:
2496             raise ExtractorError(u'Invalid manifest file')
2497
2498         url_pr = compat_urllib_parse_urlparse(manifest_url)
2499         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2500
2501         info['url'] = url
2502         info['ext'] = 'f4f'
2503         return [info]
2504
2505
2506 class XVideosIE(InfoExtractor):
2507     """Information extractor for xvideos.com"""
2508
2509     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2510     IE_NAME = u'xvideos'
2511
2512     def _real_extract(self, url):
2513         mobj = re.match(self._VALID_URL, url)
2514         if mobj is None:
2515             raise ExtractorError(u'Invalid URL: %s' % url)
2516         video_id = mobj.group(1)
2517
2518         webpage = self._download_webpage(url, video_id)
2519
2520         self.report_extraction(video_id)
2521
2522         # Extract video URL
2523         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2524             webpage, u'video URL'))
2525
2526         # Extract title
2527         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2528             webpage, u'title')
2529
2530         # Extract video thumbnail
2531         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2532             webpage, u'thumbnail', fatal=False)
2533
2534         info = {
2535             'id': video_id,
2536             'url': video_url,
2537             'uploader': None,
2538             'upload_date': None,
2539             'title': video_title,
2540             'ext': 'flv',
2541             'thumbnail': video_thumbnail,
2542             'description': None,
2543         }
2544
2545         return [info]
2546
2547
2548 class SoundcloudIE(InfoExtractor):
2549     """Information extractor for soundcloud.com
2550        To access the media, the uid of the song and a stream token
2551        must be extracted from the page source and the script must make
2552        a request to media.soundcloud.com/crossdomain.xml. Then
2553        the media can be grabbed by requesting from an url composed
2554        of the stream token and uid
2555      """
2556
2557     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2558     IE_NAME = u'soundcloud'
2559
2560     def report_resolve(self, video_id):
2561         """Report information extraction."""
2562         self.to_screen(u'%s: Resolving id' % video_id)
2563
2564     def _real_extract(self, url):
2565         mobj = re.match(self._VALID_URL, url)
2566         if mobj is None:
2567             raise ExtractorError(u'Invalid URL: %s' % url)
2568
2569         # extract uploader (which is in the url)
2570         uploader = mobj.group(1)
2571         # extract simple title (uploader + slug of song title)
2572         slug_title =  mobj.group(2)
2573         simple_title = uploader + u'-' + slug_title
2574         full_title = '%s/%s' % (uploader, slug_title)
2575
2576         self.report_resolve(full_title)
2577
2578         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2579         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2580         info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2581
2582         info = json.loads(info_json)
2583         video_id = info['id']
2584         self.report_extraction(full_title)
2585
2586         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2587         stream_json = self._download_webpage(streams_url, full_title,
2588                                              u'Downloading stream definitions',
2589                                              u'unable to download stream definitions')
2590
2591         streams = json.loads(stream_json)
2592         mediaURL = streams['http_mp3_128_url']
2593         upload_date = unified_strdate(info['created_at'])
2594
2595         return [{
2596             'id':       info['id'],
2597             'url':      mediaURL,
2598             'uploader': info['user']['username'],
2599             'upload_date': upload_date,
2600             'title':    info['title'],
2601             'ext':      u'mp3',
2602             'description': info['description'],
2603         }]
2604
2605 class SoundcloudSetIE(InfoExtractor):
2606     """Information extractor for soundcloud.com sets
2607        To access the media, the uid of the song and a stream token
2608        must be extracted from the page source and the script must make
2609        a request to media.soundcloud.com/crossdomain.xml. Then
2610        the media can be grabbed by requesting from an url composed
2611        of the stream token and uid
2612      """
2613
2614     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2615     IE_NAME = u'soundcloud:set'
2616
2617     def report_resolve(self, video_id):
2618         """Report information extraction."""
2619         self.to_screen(u'%s: Resolving id' % video_id)
2620
2621     def _real_extract(self, url):
2622         mobj = re.match(self._VALID_URL, url)
2623         if mobj is None:
2624             raise ExtractorError(u'Invalid URL: %s' % url)
2625
2626         # extract uploader (which is in the url)
2627         uploader = mobj.group(1)
2628         # extract simple title (uploader + slug of song title)
2629         slug_title =  mobj.group(2)
2630         simple_title = uploader + u'-' + slug_title
2631         full_title = '%s/sets/%s' % (uploader, slug_title)
2632
2633         self.report_resolve(full_title)
2634
2635         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2636         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2637         info_json = self._download_webpage(resolv_url, full_title)
2638
2639         videos = []
2640         info = json.loads(info_json)
2641         if 'errors' in info:
2642             for err in info['errors']:
2643                 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2644             return
2645
2646         self.report_extraction(full_title)
2647         for track in info['tracks']:
2648             video_id = track['id']
2649
2650             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2651             stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2652
2653             self.report_extraction(video_id)
2654             streams = json.loads(stream_json)
2655             mediaURL = streams['http_mp3_128_url']
2656
2657             videos.append({
2658                 'id':       video_id,
2659                 'url':      mediaURL,
2660                 'uploader': track['user']['username'],
2661                 'upload_date':  unified_strdate(track['created_at']),
2662                 'title':    track['title'],
2663                 'ext':      u'mp3',
2664                 'description': track['description'],
2665             })
2666         return videos
2667
2668
2669 class InfoQIE(InfoExtractor):
2670     """Information extractor for infoq.com"""
2671     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2672
2673     def _real_extract(self, url):
2674         mobj = re.match(self._VALID_URL, url)
2675         if mobj is None:
2676             raise ExtractorError(u'Invalid URL: %s' % url)
2677
2678         webpage = self._download_webpage(url, video_id=url)
2679         self.report_extraction(url)
2680
2681         # Extract video URL
2682         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2683         if mobj is None:
2684             raise ExtractorError(u'Unable to extract video url')
2685         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2686         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2687
2688         # Extract title
2689         video_title = self._search_regex(r'contentTitle = "(.*?)";',
2690             webpage, u'title')
2691
2692         # Extract description
2693         video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2694             webpage, u'description', fatal=False)
2695
2696         video_filename = video_url.split('/')[-1]
2697         video_id, extension = video_filename.split('.')
2698
2699         info = {
2700             'id': video_id,
2701             'url': video_url,
2702             'uploader': None,
2703             'upload_date': None,
2704             'title': video_title,
2705             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2706             'thumbnail': None,
2707             'description': video_description,
2708         }
2709
2710         return [info]
2711
2712 class MixcloudIE(InfoExtractor):
2713     """Information extractor for www.mixcloud.com"""
2714
2715     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2716     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2717     IE_NAME = u'mixcloud'
2718
2719     def report_download_json(self, file_id):
2720         """Report JSON download."""
2721         self.to_screen(u'Downloading json')
2722
2723     def get_urls(self, jsonData, fmt, bitrate='best'):
2724         """Get urls from 'audio_formats' section in json"""
2725         file_url = None
2726         try:
2727             bitrate_list = jsonData[fmt]
2728             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2729                 bitrate = max(bitrate_list) # select highest
2730
2731             url_list = jsonData[fmt][bitrate]
2732         except TypeError: # we have no bitrate info.
2733             url_list = jsonData[fmt]
2734         return url_list
2735
2736     def check_urls(self, url_list):
2737         """Returns 1st active url from list"""
2738         for url in url_list:
2739             try:
2740                 compat_urllib_request.urlopen(url)
2741                 return url
2742             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2743                 url = None
2744
2745         return None
2746
2747     def _print_formats(self, formats):
2748         print('Available formats:')
2749         for fmt in formats.keys():
2750             for b in formats[fmt]:
2751                 try:
2752                     ext = formats[fmt][b][0]
2753                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2754                 except TypeError: # we have no bitrate info
2755                     ext = formats[fmt][0]
2756                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2757                     break
2758
2759     def _real_extract(self, url):
2760         mobj = re.match(self._VALID_URL, url)
2761         if mobj is None:
2762             raise ExtractorError(u'Invalid URL: %s' % url)
2763         # extract uploader & filename from url
2764         uploader = mobj.group(1).decode('utf-8')
2765         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2766
2767         # construct API request
2768         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2769         # retrieve .json file with links to files
2770         request = compat_urllib_request.Request(file_url)
2771         try:
2772             self.report_download_json(file_url)
2773             jsonData = compat_urllib_request.urlopen(request).read()
2774         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2775             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2776
2777         # parse JSON
2778         json_data = json.loads(jsonData)
2779         player_url = json_data['player_swf_url']
2780         formats = dict(json_data['audio_formats'])
2781
2782         req_format = self._downloader.params.get('format', None)
2783         bitrate = None
2784
2785         if self._downloader.params.get('listformats', None):
2786             self._print_formats(formats)
2787             return
2788
2789         if req_format is None or req_format == 'best':
2790             for format_param in formats.keys():
2791                 url_list = self.get_urls(formats, format_param)
2792                 # check urls
2793                 file_url = self.check_urls(url_list)
2794                 if file_url is not None:
2795                     break # got it!
2796         else:
2797             if req_format not in formats:
2798                 raise ExtractorError(u'Format is not available')
2799
2800             url_list = self.get_urls(formats, req_format)
2801             file_url = self.check_urls(url_list)
2802             format_param = req_format
2803
2804         return [{
2805             'id': file_id.decode('utf-8'),
2806             'url': file_url.decode('utf-8'),
2807             'uploader': uploader.decode('utf-8'),
2808             'upload_date': None,
2809             'title': json_data['name'],
2810             'ext': file_url.split('.')[-1].decode('utf-8'),
2811             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2812             'thumbnail': json_data['thumbnail_url'],
2813             'description': json_data['description'],
2814             'player_url': player_url.decode('utf-8'),
2815         }]
2816
2817 class StanfordOpenClassroomIE(InfoExtractor):
2818     """Information extractor for Stanford's Open ClassRoom"""
2819
2820     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2821     IE_NAME = u'stanfordoc'
2822
2823     def _real_extract(self, url):
2824         mobj = re.match(self._VALID_URL, url)
2825         if mobj is None:
2826             raise ExtractorError(u'Invalid URL: %s' % url)
2827
2828         if mobj.group('course') and mobj.group('video'): # A specific video
2829             course = mobj.group('course')
2830             video = mobj.group('video')
2831             info = {
2832                 'id': course + '_' + video,
2833                 'uploader': None,
2834                 'upload_date': None,
2835             }
2836
2837             self.report_extraction(info['id'])
2838             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2839             xmlUrl = baseUrl + video + '.xml'
2840             try:
2841                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2842             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2843                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2844             mdoc = xml.etree.ElementTree.fromstring(metaXml)
2845             try:
2846                 info['title'] = mdoc.findall('./title')[0].text
2847                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2848             except IndexError:
2849                 raise ExtractorError(u'Invalid metadata XML file')
2850             info['ext'] = info['url'].rpartition('.')[2]
2851             return [info]
2852         elif mobj.group('course'): # A course page
2853             course = mobj.group('course')
2854             info = {
2855                 'id': course,
2856                 'type': 'playlist',
2857                 'uploader': None,
2858                 'upload_date': None,
2859             }
2860
2861             coursepage = self._download_webpage(url, info['id'],
2862                                         note='Downloading course info page',
2863                                         errnote='Unable to download course info page')
2864
2865             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2866
2867             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2868                 coursepage, u'description', fatal=False)
2869
2870             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2871             info['list'] = [
2872                 {
2873                     'type': 'reference',
2874                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2875                 }
2876                     for vpage in links]
2877             results = []
2878             for entry in info['list']:
2879                 assert entry['type'] == 'reference'
2880                 results += self.extract(entry['url'])
2881             return results
2882         else: # Root page
2883             info = {
2884                 'id': 'Stanford OpenClassroom',
2885                 'type': 'playlist',
2886                 'uploader': None,
2887                 'upload_date': None,
2888             }
2889
2890             self.report_download_webpage(info['id'])
2891             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2892             try:
2893                 rootpage = compat_urllib_request.urlopen(rootURL).read()
2894             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2895                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2896
2897             info['title'] = info['id']
2898
2899             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2900             info['list'] = [
2901                 {
2902                     'type': 'reference',
2903                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2904                 }
2905                     for cpage in links]
2906
2907             results = []
2908             for entry in info['list']:
2909                 assert entry['type'] == 'reference'
2910                 results += self.extract(entry['url'])
2911             return results
2912
2913 class MTVIE(InfoExtractor):
2914     """Information extractor for MTV.com"""
2915
2916     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2917     IE_NAME = u'mtv'
2918
2919     def _real_extract(self, url):
2920         mobj = re.match(self._VALID_URL, url)
2921         if mobj is None:
2922             raise ExtractorError(u'Invalid URL: %s' % url)
2923         if not mobj.group('proto'):
2924             url = 'http://' + url
2925         video_id = mobj.group('videoid')
2926
2927         webpage = self._download_webpage(url, video_id)
2928
2929         song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2930             webpage, u'song name', fatal=False)
2931
2932         video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2933             webpage, u'title')
2934
2935         mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2936             webpage, u'mtvn_uri', fatal=False)
2937
2938         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2939             webpage, u'content id', fatal=False)
2940
2941         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2942         self.report_extraction(video_id)
2943         request = compat_urllib_request.Request(videogen_url)
2944         try:
2945             metadataXml = compat_urllib_request.urlopen(request).read()
2946         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2947             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2948
2949         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2950         renditions = mdoc.findall('.//rendition')
2951
2952         # For now, always pick the highest quality.
2953         rendition = renditions[-1]
2954
2955         try:
2956             _,_,ext = rendition.attrib['type'].partition('/')
2957             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2958             video_url = rendition.find('./src').text
2959         except KeyError:
2960             raise ExtractorError('Invalid rendition field.')
2961
2962         info = {
2963             'id': video_id,
2964             'url': video_url,
2965             'uploader': performer,
2966             'upload_date': None,
2967             'title': video_title,
2968             'ext': ext,
2969             'format': format,
2970         }
2971
2972         return [info]
2973
2974
2975 class YoukuIE(InfoExtractor):
2976     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2977
2978     def _gen_sid(self):
2979         nowTime = int(time.time() * 1000)
2980         random1 = random.randint(1000,1998)
2981         random2 = random.randint(1000,9999)
2982
2983         return "%d%d%d" %(nowTime,random1,random2)
2984
2985     def _get_file_ID_mix_string(self, seed):
2986         mixed = []
2987         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2988         seed = float(seed)
2989         for i in range(len(source)):
2990             seed  =  (seed * 211 + 30031 ) % 65536
2991             index  =  math.floor(seed / 65536 * len(source) )
2992             mixed.append(source[int(index)])
2993             source.remove(source[int(index)])
2994         #return ''.join(mixed)
2995         return mixed
2996
2997     def _get_file_id(self, fileId, seed):
2998         mixed = self._get_file_ID_mix_string(seed)
2999         ids = fileId.split('*')
3000         realId = []
3001         for ch in ids:
3002             if ch:
3003                 realId.append(mixed[int(ch)])
3004         return ''.join(realId)
3005
3006     def _real_extract(self, url):
3007         mobj = re.match(self._VALID_URL, url)
3008         if mobj is None:
3009             raise ExtractorError(u'Invalid URL: %s' % url)
3010         video_id = mobj.group('ID')
3011
3012         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3013
3014         jsondata = self._download_webpage(info_url, video_id)
3015
3016         self.report_extraction(video_id)
3017         try:
3018             config = json.loads(jsondata)
3019
3020             video_title =  config['data'][0]['title']
3021             seed = config['data'][0]['seed']
3022
3023             format = self._downloader.params.get('format', None)
3024             supported_format = list(config['data'][0]['streamfileids'].keys())
3025
3026             if format is None or format == 'best':
3027                 if 'hd2' in supported_format:
3028                     format = 'hd2'
3029                 else:
3030                     format = 'flv'
3031                 ext = u'flv'
3032             elif format == 'worst':
3033                 format = 'mp4'
3034                 ext = u'mp4'
3035             else:
3036                 format = 'flv'
3037                 ext = u'flv'
3038
3039
3040             fileid = config['data'][0]['streamfileids'][format]
3041             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3042         except (UnicodeDecodeError, ValueError, KeyError):
3043             raise ExtractorError(u'Unable to extract info section')
3044
3045         files_info=[]
3046         sid = self._gen_sid()
3047         fileid = self._get_file_id(fileid, seed)
3048
3049         #column 8,9 of fileid represent the segment number
3050         #fileid[7:9] should be changed
3051         for index, key in enumerate(keys):
3052
3053             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3054             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3055
3056             info = {
3057                 'id': '%s_part%02d' % (video_id, index),
3058                 'url': download_url,
3059                 'uploader': None,
3060                 'upload_date': None,
3061                 'title': video_title,
3062                 'ext': ext,
3063             }
3064             files_info.append(info)
3065
3066         return files_info
3067
3068
3069 class XNXXIE(InfoExtractor):
3070     """Information extractor for xnxx.com"""
3071
3072     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3073     IE_NAME = u'xnxx'
3074     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3075     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3076     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3077
3078     def _real_extract(self, url):
3079         mobj = re.match(self._VALID_URL, url)
3080         if mobj is None:
3081             raise ExtractorError(u'Invalid URL: %s' % url)
3082         video_id = mobj.group(1)
3083
3084         # Get webpage content
3085         webpage = self._download_webpage(url, video_id)
3086
3087         video_url = self._search_regex(self.VIDEO_URL_RE,
3088             webpage, u'video URL')
3089         video_url = compat_urllib_parse.unquote(video_url)
3090
3091         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
3092             webpage, u'title')
3093
3094         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3095             webpage, u'thumbnail', fatal=False)
3096
3097         return [{
3098             'id': video_id,
3099             'url': video_url,
3100             'uploader': None,
3101             'upload_date': None,
3102             'title': video_title,
3103             'ext': 'flv',
3104             'thumbnail': video_thumbnail,
3105             'description': None,
3106         }]
3107
3108
3109 class GooglePlusIE(InfoExtractor):
3110     """Information extractor for plus.google.com."""
3111
3112     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3113     IE_NAME = u'plus.google'
3114
3115     def _real_extract(self, url):
3116         # Extract id from URL
3117         mobj = re.match(self._VALID_URL, url)
3118         if mobj is None:
3119             raise ExtractorError(u'Invalid URL: %s' % url)
3120
3121         post_url = mobj.group(0)
3122         video_id = mobj.group(1)
3123
3124         video_extension = 'flv'
3125
3126         # Step 1, Retrieve post webpage to extract further information
3127         webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3128
3129         self.report_extraction(video_id)
3130
3131         # Extract update date
3132         upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3133             webpage, u'upload date', fatal=False)
3134         if upload_date:
3135             # Convert timestring to a format suitable for filename
3136             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3137             upload_date = upload_date.strftime('%Y%m%d')
3138
3139         # Extract uploader
3140         uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3141             webpage, u'uploader', fatal=False)
3142
3143         # Extract title
3144         # Get the first line for title
3145         video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3146             webpage, 'title', default=u'NA')
3147
3148         # Step 2, Stimulate clicking the image box to launch video
3149         video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3150             webpage, u'video page URL')
3151         webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3152
3153         # Extract video links on video page
3154         """Extract video links of all sizes"""
3155         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3156         mobj = re.findall(pattern, webpage)
3157         if len(mobj) == 0:
3158             raise ExtractorError(u'Unable to extract video links')
3159
3160         # Sort in resolution
3161         links = sorted(mobj)
3162
3163         # Choose the lowest of the sort, i.e. highest resolution
3164         video_url = links[-1]
3165         # Only get the url. The resolution part in the tuple has no use anymore
3166         video_url = video_url[-1]
3167         # Treat escaped \u0026 style hex
3168         try:
3169             video_url = video_url.decode("unicode_escape")
3170         except AttributeError: # Python 3
3171             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3172
3173
3174         return [{
3175             'id':       video_id,
3176             'url':      video_url,
3177             'uploader': uploader,
3178             'upload_date':  upload_date,
3179             'title':    video_title,
3180             'ext':      video_extension,
3181         }]
3182
3183 class NBAIE(InfoExtractor):
3184     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3185     IE_NAME = u'nba'
3186
3187     def _real_extract(self, url):
3188         mobj = re.match(self._VALID_URL, url)
3189         if mobj is None:
3190             raise ExtractorError(u'Invalid URL: %s' % url)
3191
3192         video_id = mobj.group(1)
3193
3194         webpage = self._download_webpage(url, video_id)
3195
3196         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3197
3198         shortened_video_id = video_id.rpartition('/')[2]
3199         title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3200             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3201
3202         # It isn't there in the HTML it returns to us
3203         # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3204
3205         description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3206
3207         info = {
3208             'id': shortened_video_id,
3209             'url': video_url,
3210             'ext': 'mp4',
3211             'title': title,
3212             # 'uploader_date': uploader_date,
3213             'description': description,
3214         }
3215         return [info]
3216
3217 class JustinTVIE(InfoExtractor):
3218     """Information extractor for justin.tv and twitch.tv"""
3219     # TODO: One broadcast may be split into multiple videos. The key
3220     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3221     # starts at 1 and increases. Can we treat all parts as one video?
3222
3223     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3224         (?:
3225             (?P<channelid>[^/]+)|
3226             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3227             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3228         )
3229         /?(?:\#.*)?$
3230         """
3231     _JUSTIN_PAGE_LIMIT = 100
3232     IE_NAME = u'justin.tv'
3233
3234     def report_download_page(self, channel, offset):
3235         """Report attempt to download a single page of videos."""
3236         self.to_screen(u'%s: Downloading video information from %d to %d' %
3237                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3238
3239     # Return count of items, list of *valid* items
3240     def _parse_page(self, url, video_id):
3241         webpage = self._download_webpage(url, video_id,
3242                                          u'Downloading video info JSON',
3243                                          u'unable to download video info JSON')
3244
3245         response = json.loads(webpage)
3246         if type(response) != list:
3247             error_text = response.get('error', 'unknown error')
3248             raise ExtractorError(u'Justin.tv API: %s' % error_text)
3249         info = []
3250         for clip in response:
3251             video_url = clip['video_file_url']
3252             if video_url:
3253                 video_extension = os.path.splitext(video_url)[1][1:]
3254                 video_date = re.sub('-', '', clip['start_time'][:10])
3255                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3256                 video_id = clip['id']
3257                 video_title = clip.get('title', video_id)
3258                 info.append({
3259                     'id': video_id,
3260                     'url': video_url,
3261                     'title': video_title,
3262                     'uploader': clip.get('channel_name', video_uploader_id),
3263                     'uploader_id': video_uploader_id,
3264                     'upload_date': video_date,
3265                     'ext': video_extension,
3266                 })
3267         return (len(response), info)
3268
3269     def _real_extract(self, url):
3270         mobj = re.match(self._VALID_URL, url)
3271         if mobj is None:
3272             raise ExtractorError(u'invalid URL: %s' % url)
3273
3274         api_base = 'http://api.justin.tv'
3275         paged = False
3276         if mobj.group('channelid'):
3277             paged = True
3278             video_id = mobj.group('channelid')
3279             api = api_base + '/channel/archives/%s.json' % video_id
3280         elif mobj.group('chapterid'):
3281             chapter_id = mobj.group('chapterid')
3282
3283             webpage = self._download_webpage(url, chapter_id)
3284             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3285             if not m:
3286                 raise ExtractorError(u'Cannot find archive of a chapter')
3287             archive_id = m.group(1)
3288
3289             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3290             chapter_info_xml = self._download_webpage(api, chapter_id,
3291                                              note=u'Downloading chapter information',
3292                                              errnote=u'Chapter information download failed')
3293             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3294             for a in doc.findall('.//archive'):
3295                 if archive_id == a.find('./id').text:
3296                     break
3297             else:
3298                 raise ExtractorError(u'Could not find chapter in chapter information')
3299
3300             video_url = a.find('./video_file_url').text
3301             video_ext = video_url.rpartition('.')[2] or u'flv'
3302
3303             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3304             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3305                                    note='Downloading chapter metadata',
3306                                    errnote='Download of chapter metadata failed')
3307             chapter_info = json.loads(chapter_info_json)
3308
3309             bracket_start = int(doc.find('.//bracket_start').text)
3310             bracket_end = int(doc.find('.//bracket_end').text)
3311
3312             # TODO determine start (and probably fix up file)
3313             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3314             #video_url += u'?start=' + TODO:start_timestamp
3315             # bracket_start is 13290, but we want 51670615
3316             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3317                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3318
3319             info = {
3320                 'id': u'c' + chapter_id,
3321                 'url': video_url,
3322                 'ext': video_ext,
3323                 'title': chapter_info['title'],
3324                 'thumbnail': chapter_info['preview'],
3325                 'description': chapter_info['description'],
3326                 'uploader': chapter_info['channel']['display_name'],
3327                 'uploader_id': chapter_info['channel']['name'],
3328             }
3329             return [info]
3330         else:
3331             video_id = mobj.group('videoid')
3332             api = api_base + '/broadcast/by_archive/%s.json' % video_id
3333
3334         self.report_extraction(video_id)
3335
3336         info = []
3337         offset = 0
3338         limit = self._JUSTIN_PAGE_LIMIT
3339         while True:
3340             if paged:
3341                 self.report_download_page(video_id, offset)
3342             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3343             page_count, page_info = self._parse_page(page_url, video_id)
3344             info.extend(page_info)
3345             if not paged or page_count != limit:
3346                 break
3347             offset += limit
3348         return info
3349
3350 class FunnyOrDieIE(InfoExtractor):
3351     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3352
3353     def _real_extract(self, url):
3354         mobj = re.match(self._VALID_URL, url)
3355         if mobj is None:
3356             raise ExtractorError(u'invalid URL: %s' % url)
3357
3358         video_id = mobj.group('id')
3359         webpage = self._download_webpage(url, video_id)
3360
3361         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3362             webpage, u'video URL', flags=re.DOTALL)
3363
3364         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3365             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3366
3367         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3368             webpage, u'description', fatal=False, flags=re.DOTALL)
3369
3370         info = {
3371             'id': video_id,
3372             'url': video_url,
3373             'ext': 'mp4',
3374             'title': title,
3375             'description': video_description,
3376         }
3377         return [info]
3378
3379 class SteamIE(InfoExtractor):
3380     _VALID_URL = r"""http://store\.steampowered\.com/
3381                 (agecheck/)?
3382                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3383                 (?P<gameID>\d+)/?
3384                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3385                 """
3386     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
3387     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
3388
3389     @classmethod
3390     def suitable(cls, url):
3391         """Receives a URL and returns True if suitable for this IE."""
3392         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3393
3394     def _real_extract(self, url):
3395         m = re.match(self._VALID_URL, url, re.VERBOSE)
3396         gameID = m.group('gameID')
3397
3398         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3399         webpage = self._download_webpage(videourl, gameID)
3400
3401         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3402             videourl = self._AGECHECK_TEMPLATE % gameID
3403             self.report_age_confirmation()
3404             webpage = self._download_webpage(videourl, gameID)
3405
3406         self.report_extraction(gameID)
3407         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3408                                              webpage, 'game title')
3409
3410         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3411         mweb = re.finditer(urlRE, webpage)
3412         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3413         titles = re.finditer(namesRE, webpage)
3414         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3415         thumbs = re.finditer(thumbsRE, webpage)
3416         videos = []
3417         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3418             video_id = vid.group('videoID')
3419             title = vtitle.group('videoName')
3420             video_url = vid.group('videoURL')
3421             video_thumb = thumb.group('thumbnail')
3422             if not video_url:
3423                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3424             info = {
3425                 'id':video_id,
3426                 'url':video_url,
3427                 'ext': 'flv',
3428                 'title': unescapeHTML(title),
3429                 'thumbnail': video_thumb
3430                   }
3431             videos.append(info)
3432         return [self.playlist_result(videos, gameID, game_title)]
3433
3434 class UstreamIE(InfoExtractor):
3435     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3436     IE_NAME = u'ustream'
3437
3438     def _real_extract(self, url):
3439         m = re.match(self._VALID_URL, url)
3440         video_id = m.group('videoID')
3441
3442         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3443         webpage = self._download_webpage(url, video_id)
3444
3445         self.report_extraction(video_id)
3446
3447         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3448             webpage, u'title')
3449
3450         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3451             webpage, u'uploader', fatal=False, flags=re.DOTALL)
3452
3453         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3454             webpage, u'thumbnail', fatal=False)
3455
3456         info = {
3457                 'id': video_id,
3458                 'url': video_url,
3459                 'ext': 'flv',
3460                 'title': video_title,
3461                 'uploader': uploader,
3462                 'thumbnail': thumbnail,
3463                }
3464         return info
3465
3466 class WorldStarHipHopIE(InfoExtractor):
3467     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3468     IE_NAME = u'WorldStarHipHop'
3469
3470     def _real_extract(self, url):
3471         m = re.match(self._VALID_URL, url)
3472         video_id = m.group('id')
3473
3474         webpage_src = self._download_webpage(url, video_id)
3475
3476         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3477             webpage_src, u'video URL')
3478
3479         if 'mp4' in video_url:
3480             ext = 'mp4'
3481         else:
3482             ext = 'flv'
3483
3484         video_title = self._html_search_regex(r"<title>(.*)</title>",
3485             webpage_src, u'title')
3486
3487         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3488         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3489             webpage_src, u'thumbnail', fatal=False)
3490
3491         if not thumbnail:
3492             _title = r"""candytitles.*>(.*)</span>"""
3493             mobj = re.search(_title, webpage_src)
3494             if mobj is not None:
3495                 video_title = mobj.group(1)
3496
3497         results = [{
3498                     'id': video_id,
3499                     'url' : video_url,
3500                     'title' : video_title,
3501                     'thumbnail' : thumbnail,
3502                     'ext' : ext,
3503                     }]
3504         return results
3505
3506 class RBMARadioIE(InfoExtractor):
3507     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3508
3509     def _real_extract(self, url):
3510         m = re.match(self._VALID_URL, url)
3511         video_id = m.group('videoID')
3512
3513         webpage = self._download_webpage(url, video_id)
3514
3515         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
3516             webpage, u'json data', flags=re.MULTILINE)
3517
3518         try:
3519             data = json.loads(json_data)
3520         except ValueError as e:
3521             raise ExtractorError(u'Invalid JSON: ' + str(e))
3522
3523         video_url = data['akamai_url'] + '&cbr=256'
3524         url_parts = compat_urllib_parse_urlparse(video_url)
3525         video_ext = url_parts.path.rpartition('.')[2]
3526         info = {
3527                 'id': video_id,
3528                 'url': video_url,
3529                 'ext': video_ext,
3530                 'title': data['title'],
3531                 'description': data.get('teaser_text'),
3532                 'location': data.get('country_of_origin'),
3533                 'uploader': data.get('host', {}).get('name'),
3534                 'uploader_id': data.get('host', {}).get('slug'),
3535                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3536                 'duration': data.get('duration'),
3537         }
3538         return [info]
3539
3540
3541 class YouPornIE(InfoExtractor):
3542     """Information extractor for youporn.com."""
3543     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3544
3545     def _print_formats(self, formats):
3546         """Print all available formats"""
3547         print(u'Available formats:')
3548         print(u'ext\t\tformat')
3549         print(u'---------------------------------')
3550         for format in formats:
3551             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3552
3553     def _specific(self, req_format, formats):
3554         for x in formats:
3555             if(x["format"]==req_format):
3556                 return x
3557         return None
3558
3559     def _real_extract(self, url):
3560         mobj = re.match(self._VALID_URL, url)
3561         if mobj is None:
3562             raise ExtractorError(u'Invalid URL: %s' % url)
3563         video_id = mobj.group('videoid')
3564
3565         req = compat_urllib_request.Request(url)
3566         req.add_header('Cookie', 'age_verified=1')
3567         webpage = self._download_webpage(req, video_id)
3568
3569         # Get JSON parameters
3570         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3571         try:
3572             params = json.loads(json_params)
3573         except:
3574             raise ExtractorError(u'Invalid JSON')
3575
3576         self.report_extraction(video_id)
3577         try:
3578             video_title = params['title']
3579             upload_date = unified_strdate(params['release_date_f'])
3580             video_description = params['description']
3581             video_uploader = params['submitted_by']
3582             thumbnail = params['thumbnails'][0]['image']
3583         except KeyError:
3584             raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3585
3586         # Get all of the formats available
3587         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3588         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3589             webpage, u'download list').strip()
3590
3591         # Get all of the links from the page
3592         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3593         links = re.findall(LINK_RE, download_list_html)
3594         if(len(links) == 0):
3595             raise ExtractorError(u'ERROR: no known formats available for video')
3596
3597         self.to_screen(u'Links found: %d' % len(links))
3598
3599         formats = []
3600         for link in links:
3601
3602             # A link looks like this:
3603             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3604             # A path looks like this:
3605             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3606             video_url = unescapeHTML( link )
3607             path = compat_urllib_parse_urlparse( video_url ).path
3608             extension = os.path.splitext( path )[1][1:]
3609             format = path.split('/')[4].split('_')[:2]
3610             size = format[0]
3611             bitrate = format[1]
3612             format = "-".join( format )
3613             # title = u'%s-%s-%s' % (video_title, size, bitrate)
3614
3615             formats.append({
3616                 'id': video_id,
3617                 'url': video_url,
3618                 'uploader': video_uploader,
3619                 'upload_date': upload_date,
3620                 'title': video_title,
3621                 'ext': extension,
3622                 'format': format,
3623                 'thumbnail': thumbnail,
3624                 'description': video_description
3625             })
3626
3627         if self._downloader.params.get('listformats', None):
3628             self._print_formats(formats)
3629             return
3630
3631         req_format = self._downloader.params.get('format', None)
3632         self.to_screen(u'Format: %s' % req_format)
3633
3634         if req_format is None or req_format == 'best':
3635             return [formats[0]]
3636         elif req_format == 'worst':
3637             return [formats[-1]]
3638         elif req_format in ('-1', 'all'):
3639             return formats
3640         else:
3641             format = self._specific( req_format, formats )
3642             if result is None:
3643                 raise ExtractorError(u'Requested format not available')
3644             return [format]
3645
3646
3647
3648 class PornotubeIE(InfoExtractor):
3649     """Information extractor for pornotube.com."""
3650     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3651
3652     def _real_extract(self, url):
3653         mobj = re.match(self._VALID_URL, url)
3654         if mobj is None:
3655             raise ExtractorError(u'Invalid URL: %s' % url)
3656
3657         video_id = mobj.group('videoid')
3658         video_title = mobj.group('title')
3659
3660         # Get webpage content
3661         webpage = self._download_webpage(url, video_id)
3662
3663         # Get the video URL
3664         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3665         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3666         video_url = compat_urllib_parse.unquote(video_url)
3667
3668         #Get the uploaded date
3669         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3670         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
3671         if upload_date: upload_date = unified_strdate(upload_date)
3672
3673         info = {'id': video_id,
3674                 'url': video_url,
3675                 'uploader': None,
3676                 'upload_date': upload_date,
3677                 'title': video_title,
3678                 'ext': 'flv',
3679                 'format': 'flv'}
3680
3681         return [info]
3682
3683 class YouJizzIE(InfoExtractor):
3684     """Information extractor for youjizz.com."""
3685     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3686
3687     def _real_extract(self, url):
3688         mobj = re.match(self._VALID_URL, url)
3689         if mobj is None:
3690             raise ExtractorError(u'Invalid URL: %s' % url)
3691
3692         video_id = mobj.group('videoid')
3693
3694         # Get webpage content
3695         webpage = self._download_webpage(url, video_id)
3696
3697         # Get the video title
3698         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3699             webpage, u'title').strip()
3700
3701         # Get the embed page
3702         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3703         if result is None:
3704             raise ExtractorError(u'ERROR: unable to extract embed page')
3705
3706         embed_page_url = result.group(0).strip()
3707         video_id = result.group('videoid')
3708
3709         webpage = self._download_webpage(embed_page_url, video_id)
3710
3711         # Get the video URL
3712         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3713             webpage, u'video URL')
3714
3715         info = {'id': video_id,
3716                 'url': video_url,
3717                 'title': video_title,
3718                 'ext': 'flv',
3719                 'format': 'flv',
3720                 'player_url': embed_page_url}
3721
3722         return [info]
3723
3724 class EightTracksIE(InfoExtractor):
3725     IE_NAME = '8tracks'
3726     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3727
3728     def _real_extract(self, url):
3729         mobj = re.match(self._VALID_URL, url)
3730         if mobj is None:
3731             raise ExtractorError(u'Invalid URL: %s' % url)
3732         playlist_id = mobj.group('id')
3733
3734         webpage = self._download_webpage(url, playlist_id)
3735
3736         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3737         data = json.loads(json_like)
3738
3739         session = str(random.randint(0, 1000000000))
3740         mix_id = data['id']
3741         track_count = data['tracks_count']
3742         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3743         next_url = first_url
3744         res = []
3745         for i in itertools.count():
3746             api_json = self._download_webpage(next_url, playlist_id,
3747                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3748                 errnote=u'Failed to download song information')
3749             api_data = json.loads(api_json)
3750             track_data = api_data[u'set']['track']
3751             info = {
3752                 'id': track_data['id'],
3753                 'url': track_data['track_file_stream_url'],
3754                 'title': track_data['performer'] + u' - ' + track_data['name'],
3755                 'raw_title': track_data['name'],
3756                 'uploader_id': data['user']['login'],
3757                 'ext': 'm4a',
3758             }
3759             res.append(info)
3760             if api_data['set']['at_last_track']:
3761                 break
3762             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3763         return res
3764
3765 class KeekIE(InfoExtractor):
3766     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3767     IE_NAME = u'keek'
3768
3769     def _real_extract(self, url):
3770         m = re.match(self._VALID_URL, url)
3771         video_id = m.group('videoID')
3772
3773         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3774         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3775         webpage = self._download_webpage(url, video_id)
3776
3777         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3778             webpage, u'title')
3779
3780         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3781             webpage, u'uploader', fatal=False)
3782
3783         info = {
3784                 'id': video_id,
3785                 'url': video_url,
3786                 'ext': 'mp4',
3787                 'title': video_title,
3788                 'thumbnail': thumbnail,
3789                 'uploader': uploader
3790         }
3791         return [info]
3792
3793 class TEDIE(InfoExtractor):
3794     _VALID_URL=r'''http://www\.ted\.com/
3795                    (
3796                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3797                         |
3798                         ((?P<type_talk>talks)) # We have a simple talk
3799                    )
3800                    (/lang/(.*?))? # The url may contain the language
3801                    /(?P<name>\w+) # Here goes the name and then ".html"
3802                    '''
3803
3804     @classmethod
3805     def suitable(cls, url):
3806         """Receives a URL and returns True if suitable for this IE."""
3807         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3808
3809     def _real_extract(self, url):
3810         m=re.match(self._VALID_URL, url, re.VERBOSE)
3811         if m.group('type_talk'):
3812             return [self._talk_info(url)]
3813         else :
3814             playlist_id=m.group('playlist_id')
3815             name=m.group('name')
3816             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3817             return [self._playlist_videos_info(url,name,playlist_id)]
3818
3819     def _playlist_videos_info(self,url,name,playlist_id=0):
3820         '''Returns the videos of the playlist'''
3821         video_RE=r'''
3822                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3823                      ([.\s]*?)data-playlist_item_id="(\d+)"
3824                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3825                      '''
3826         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3827         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3828         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3829         m_names=re.finditer(video_name_RE,webpage)
3830
3831         playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
3832                                                  webpage, 'playlist title')
3833
3834         playlist_entries = []
3835         for m_video, m_name in zip(m_videos,m_names):
3836             video_id=m_video.group('video_id')
3837             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3838             playlist_entries.append(self.url_result(talk_url, 'TED'))
3839         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3840
3841     def _talk_info(self, url, video_id=0):
3842         """Return the video for the talk in the url"""
3843         m = re.match(self._VALID_URL, url,re.VERBOSE)
3844         video_name = m.group('name')
3845         webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
3846         self.report_extraction(video_name)
3847         # If the url includes the language we get the title translated
3848         title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
3849                                         webpage, 'title')
3850         json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
3851                                     webpage, 'json data')
3852         info = json.loads(json_data)
3853         desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
3854                                        webpage, 'description', flags = re.DOTALL)
3855
3856         thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
3857                                        webpage, 'thumbnail')
3858         info = {
3859                 'id': info['id'],
3860                 'url': info['htmlStreams'][-1]['file'],
3861                 'ext': 'mp4',
3862                 'title': title,
3863                 'thumbnail': thumbnail,
3864                 'description': desc,
3865                 }
3866         return info
3867
3868 class MySpassIE(InfoExtractor):
3869     _VALID_URL = r'http://www.myspass.de/.*'
3870
3871     def _real_extract(self, url):
3872         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3873
3874         # video id is the last path element of the URL
3875         # usually there is a trailing slash, so also try the second but last
3876         url_path = compat_urllib_parse_urlparse(url).path
3877         url_parent_path, video_id = os.path.split(url_path)
3878         if not video_id:
3879             _, video_id = os.path.split(url_parent_path)
3880
3881         # get metadata
3882         metadata_url = META_DATA_URL_TEMPLATE % video_id
3883         metadata_text = self._download_webpage(metadata_url, video_id)
3884         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3885
3886         # extract values from metadata
3887         url_flv_el = metadata.find('url_flv')
3888         if url_flv_el is None:
3889             raise ExtractorError(u'Unable to extract download url')
3890         video_url = url_flv_el.text
3891         extension = os.path.splitext(video_url)[1][1:]
3892         title_el = metadata.find('title')
3893         if title_el is None:
3894             raise ExtractorError(u'Unable to extract title')
3895         title = title_el.text
3896         format_id_el = metadata.find('format_id')
3897         if format_id_el is None:
3898             format = ext
3899         else:
3900             format = format_id_el.text
3901         description_el = metadata.find('description')
3902         if description_el is not None:
3903             description = description_el.text
3904         else:
3905             description = None
3906         imagePreview_el = metadata.find('imagePreview')
3907         if imagePreview_el is not None:
3908             thumbnail = imagePreview_el.text
3909         else:
3910             thumbnail = None
3911         info = {
3912             'id': video_id,
3913             'url': video_url,
3914             'title': title,
3915             'ext': extension,
3916             'format': format,
3917             'thumbnail': thumbnail,
3918             'description': description
3919         }
3920         return [info]
3921
3922 class SpiegelIE(InfoExtractor):
3923     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3924
3925     def _real_extract(self, url):
3926         m = re.match(self._VALID_URL, url)
3927         video_id = m.group('videoID')
3928
3929         webpage = self._download_webpage(url, video_id)
3930
3931         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3932             webpage, u'title')
3933
3934         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3935         xml_code = self._download_webpage(xml_url, video_id,
3936                     note=u'Downloading XML', errnote=u'Failed to download XML')
3937
3938         idoc = xml.etree.ElementTree.fromstring(xml_code)
3939         last_type = idoc[-1]
3940         filename = last_type.findall('./filename')[0].text
3941         duration = float(last_type.findall('./duration')[0].text)
3942
3943         video_url = 'http://video2.spiegel.de/flash/' + filename
3944         video_ext = filename.rpartition('.')[2]
3945         info = {
3946             'id': video_id,
3947             'url': video_url,
3948             'ext': video_ext,
3949             'title': video_title,
3950             'duration': duration,
3951         }
3952         return [info]
3953
3954 class LiveLeakIE(InfoExtractor):
3955
3956     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3957     IE_NAME = u'liveleak'
3958
3959     def _real_extract(self, url):
3960         mobj = re.match(self._VALID_URL, url)
3961         if mobj is None:
3962             raise ExtractorError(u'Invalid URL: %s' % url)
3963
3964         video_id = mobj.group('video_id')
3965
3966         webpage = self._download_webpage(url, video_id)
3967
3968         video_url = self._search_regex(r'file: "(.*?)",',
3969             webpage, u'video URL')
3970
3971         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3972             webpage, u'title').replace('LiveLeak.com -', '').strip()
3973
3974         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3975             webpage, u'description', fatal=False)
3976
3977         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3978             webpage, u'uploader', fatal=False)
3979
3980         info = {
3981             'id':  video_id,
3982             'url': video_url,
3983             'ext': 'mp4',
3984             'title': video_title,
3985             'description': video_description,
3986             'uploader': video_uploader
3987         }
3988
3989         return [info]
3990
3991 class ARDIE(InfoExtractor):
3992     _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3993     _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3994     _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3995
3996     def _real_extract(self, url):
3997         # determine video id from url
3998         m = re.match(self._VALID_URL, url)
3999
4000         numid = re.search(r'documentId=([0-9]+)', url)
4001         if numid:
4002             video_id = numid.group(1)
4003         else:
4004             video_id = m.group('video_id')
4005
4006         # determine title and media streams from webpage
4007         html = self._download_webpage(url, video_id)
4008         title = re.search(self._TITLE, html).group('title')
4009         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4010         if not streams:
4011             assert '"fsk"' in html
4012             raise ExtractorError(u'This video is only available after 8:00 pm')
4013
4014         # choose default media type and highest quality for now
4015         stream = max([s for s in streams if int(s["media_type"]) == 0],
4016                      key=lambda s: int(s["quality"]))
4017
4018         # there's two possibilities: RTMP stream or HTTP download
4019         info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4020         if stream['rtmp_url']:
4021             self.to_screen(u'RTMP download detected')
4022             assert stream['video_url'].startswith('mp4:')
4023             info["url"] = stream["rtmp_url"]
4024             info["play_path"] = stream['video_url']
4025         else:
4026             assert stream["video_url"].endswith('.mp4')
4027             info["url"] = stream["video_url"]
4028         return [info]
4029
4030 class ZDFIE(InfoExtractor):
4031     _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4032     _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
4033     _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
4034     _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
4035     _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
4036
4037     def _real_extract(self, url):
4038         mobj = re.match(self._VALID_URL, url)
4039         if mobj is None:
4040             raise ExtractorError(u'Invalid URL: %s' % url)
4041         video_id = mobj.group('video_id')
4042
4043         html = self._download_webpage(url, video_id)
4044         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4045         if streams is None:
4046             raise ExtractorError(u'No media url found.')
4047
4048         # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
4049         # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
4050         # choose first/default media type and highest quality for now
4051         for s in streams:        #find 300 - dsl1000mbit
4052             if s['quality'] == '300' and s['media_type'] == 'wstreaming':
4053                 stream_=s
4054                 break
4055         for s in streams:        #find veryhigh - dsl2000mbit
4056             if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
4057                 stream_=s
4058                 break
4059         if stream_ is None:
4060             raise ExtractorError(u'No stream found.')
4061
4062         media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
4063
4064         self.report_extraction(video_id)
4065         mobj = re.search(self._TITLE, html)
4066         if mobj is None:
4067             raise ExtractorError(u'Cannot extract title')
4068         title = unescapeHTML(mobj.group('title'))
4069
4070         mobj = re.search(self._MMS_STREAM, media_link)
4071         if mobj is None:
4072             mobj = re.search(self._RTSP_STREAM, media_link)
4073             if mobj is None:
4074                 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
4075         mms_url = mobj.group('video_url')
4076
4077         mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
4078         if mobj is None:
4079             raise ExtractorError(u'Cannot extract extention')
4080         ext = mobj.group('ext')
4081
4082         return [{'id': video_id,
4083                  'url': mms_url,
4084                  'title': title,
4085                  'ext': ext
4086                  }]
4087
4088 class TumblrIE(InfoExtractor):
4089     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4090
4091     def _real_extract(self, url):
4092         m_url = re.match(self._VALID_URL, url)
4093         video_id = m_url.group('id')
4094         blog = m_url.group('blog_name')
4095
4096         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4097         webpage = self._download_webpage(url, video_id)
4098
4099         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4100         video = re.search(re_video, webpage)
4101         if video is None:
4102            raise ExtractorError(u'Unable to extract video')
4103         video_url = video.group('video_url')
4104         ext = video.group('ext')
4105
4106         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
4107             webpage, u'thumbnail', fatal=False)  # We pick the first poster
4108         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
4109
4110         # The only place where you can get a title, it's not complete,
4111         # but searching in other places doesn't work for all videos
4112         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
4113             webpage, u'title', flags=re.DOTALL)
4114
4115         return [{'id': video_id,
4116                  'url': video_url,
4117                  'title': video_title,
4118                  'thumbnail': video_thumbnail,
4119                  'ext': ext
4120                  }]
4121
4122 class BandcampIE(InfoExtractor):
4123     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4124
4125     def _real_extract(self, url):
4126         mobj = re.match(self._VALID_URL, url)
4127         title = mobj.group('title')
4128         webpage = self._download_webpage(url, title)
4129         # We get the link to the free download page
4130         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4131         if m_download is None:
4132             raise ExtractorError(u'No free songs found')
4133
4134         download_link = m_download.group(1)
4135         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4136                        webpage, re.MULTILINE|re.DOTALL).group('id')
4137
4138         download_webpage = self._download_webpage(download_link, id,
4139                                                   'Downloading free downloads page')
4140         # We get the dictionary of the track from some javascrip code
4141         info = re.search(r'items: (.*?),$',
4142                          download_webpage, re.MULTILINE).group(1)
4143         info = json.loads(info)[0]
4144         # We pick mp3-320 for now, until format selection can be easily implemented.
4145         mp3_info = info[u'downloads'][u'mp3-320']
4146         # If we try to use this url it says the link has expired
4147         initial_url = mp3_info[u'url']
4148         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4149         m_url = re.match(re_url, initial_url)
4150         #We build the url we will use to get the final track url
4151         # This url is build in Bandcamp in the script download_bunde_*.js
4152         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4153         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4154         # If we could correctly generate the .rand field the url would be
4155         #in the "download_url" key
4156         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4157
4158         track_info = {'id':id,
4159                       'title' : info[u'title'],
4160                       'ext' :   'mp3',
4161                       'url' :   final_url,
4162                       'thumbnail' : info[u'thumb_url'],
4163                       'uploader' :  info[u'artist']
4164                       }
4165
4166         return [track_info]
4167
4168 class RedTubeIE(InfoExtractor):
4169     """Information Extractor for redtube"""
4170     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4171
4172     def _real_extract(self,url):
4173         mobj = re.match(self._VALID_URL, url)
4174         if mobj is None:
4175             raise ExtractorError(u'Invalid URL: %s' % url)
4176
4177         video_id = mobj.group('id')
4178         video_extension = 'mp4'
4179         webpage = self._download_webpage(url, video_id)
4180
4181         self.report_extraction(video_id)
4182
4183         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
4184             webpage, u'video URL')
4185
4186         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
4187             webpage, u'title')
4188
4189         return [{
4190             'id':       video_id,
4191             'url':      video_url,
4192             'ext':      video_extension,
4193             'title':    video_title,
4194         }]
4195
4196 class InaIE(InfoExtractor):
4197     """Information Extractor for Ina.fr"""
4198     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4199
4200     def _real_extract(self,url):
4201         mobj = re.match(self._VALID_URL, url)
4202
4203         video_id = mobj.group('id')
4204         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4205         video_extension = 'mp4'
4206         webpage = self._download_webpage(mrss_url, video_id)
4207
4208         self.report_extraction(video_id)
4209
4210         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
4211             webpage, u'video URL')
4212
4213         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
4214             webpage, u'title')
4215
4216         return [{
4217             'id':       video_id,
4218             'url':      video_url,
4219             'ext':      video_extension,
4220             'title':    video_title,
4221         }]
4222
4223 class HowcastIE(InfoExtractor):
4224     """Information Extractor for Howcast.com"""
4225     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4226
4227     def _real_extract(self, url):
4228         mobj = re.match(self._VALID_URL, url)
4229
4230         video_id = mobj.group('id')
4231         webpage_url = 'http://www.howcast.com/videos/' + video_id
4232         webpage = self._download_webpage(webpage_url, video_id)
4233
4234         self.report_extraction(video_id)
4235
4236         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
4237             webpage, u'video URL')
4238
4239         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
4240             webpage, u'title')
4241
4242         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
4243             webpage, u'description', fatal=False)
4244
4245         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
4246             webpage, u'thumbnail', fatal=False)
4247
4248         return [{
4249             'id':       video_id,
4250             'url':      video_url,
4251             'ext':      'mp4',
4252             'title':    video_title,
4253             'description': video_description,
4254             'thumbnail': thumbnail,
4255         }]
4256
4257 class VineIE(InfoExtractor):
4258     """Information Extractor for Vine.co"""
4259     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4260
4261     def _real_extract(self, url):
4262         mobj = re.match(self._VALID_URL, url)
4263
4264         video_id = mobj.group('id')
4265         webpage_url = 'https://vine.co/v/' + video_id
4266         webpage = self._download_webpage(webpage_url, video_id)
4267
4268         self.report_extraction(video_id)
4269
4270         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
4271             webpage, u'video URL')
4272
4273         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
4274             webpage, u'title')
4275
4276         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
4277             webpage, u'thumbnail', fatal=False)
4278
4279         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
4280             webpage, u'uploader', fatal=False, flags=re.DOTALL)
4281
4282         return [{
4283             'id':        video_id,
4284             'url':       video_url,
4285             'ext':       'mp4',
4286             'title':     video_title,
4287             'thumbnail': thumbnail,
4288             'uploader':  uploader,
4289         }]
4290
4291 class FlickrIE(InfoExtractor):
4292     """Information Extractor for Flickr videos"""
4293     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4294
4295     def _real_extract(self, url):
4296         mobj = re.match(self._VALID_URL, url)
4297
4298         video_id = mobj.group('id')
4299         video_uploader_id = mobj.group('uploader_id')
4300         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4301         webpage = self._download_webpage(webpage_url, video_id)
4302
4303         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
4304
4305         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4306         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
4307
4308         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
4309             first_xml, u'node_id')
4310
4311         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4312         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4313
4314         self.report_extraction(video_id)
4315
4316         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
4317         if mobj is None:
4318             raise ExtractorError(u'Unable to extract video url')
4319         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
4320
4321         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
4322             webpage, u'video title')
4323
4324         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
4325             webpage, u'description', fatal=False)
4326
4327         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
4328             webpage, u'thumbnail', fatal=False)
4329
4330         return [{
4331             'id':          video_id,
4332             'url':         video_url,
4333             'ext':         'mp4',
4334             'title':       video_title,
4335             'description': video_description,
4336             'thumbnail':   thumbnail,
4337             'uploader_id': video_uploader_id,
4338         }]
4339
4340 class TeamcocoIE(InfoExtractor):
4341     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4342
4343     def _real_extract(self, url):
4344         mobj = re.match(self._VALID_URL, url)
4345         if mobj is None:
4346             raise ExtractorError(u'Invalid URL: %s' % url)
4347         url_title = mobj.group('url_title')
4348         webpage = self._download_webpage(url, url_title)
4349
4350         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
4351             webpage, u'video id')
4352
4353         self.report_extraction(video_id)
4354
4355         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
4356             webpage, u'title')
4357
4358         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
4359             webpage, u'thumbnail', fatal=False)
4360
4361         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
4362             webpage, u'description', fatal=False)
4363
4364         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4365         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
4366
4367         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
4368             data, u'video URL')
4369
4370         return [{
4371             'id':          video_id,
4372             'url':         video_url,
4373             'ext':         'mp4',
4374             'title':       video_title,
4375             'thumbnail':   thumbnail,
4376             'description': video_description,
4377         }]
4378
4379 class XHamsterIE(InfoExtractor):
4380     """Information Extractor for xHamster"""
4381     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4382
4383     def _real_extract(self,url):
4384         mobj = re.match(self._VALID_URL, url)
4385
4386         video_id = mobj.group('id')
4387         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
4388         webpage = self._download_webpage(mrss_url, video_id)
4389
4390         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4391         if mobj is None:
4392             raise ExtractorError(u'Unable to extract media URL')
4393         if len(mobj.group('server')) == 0:
4394             video_url = compat_urllib_parse.unquote(mobj.group('file'))
4395         else:
4396             video_url = mobj.group('server')+'/key='+mobj.group('file')
4397         video_extension = video_url.split('.')[-1]
4398
4399         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
4400             webpage, u'title')
4401
4402         # Can't see the description anywhere in the UI
4403         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
4404         #     webpage, u'description', fatal=False)
4405         # if video_description: video_description = unescapeHTML(video_description)
4406
4407         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4408         if mobj:
4409             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4410         else:
4411             video_upload_date = None
4412             self._downloader.report_warning(u'Unable to extract upload date')
4413
4414         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
4415             webpage, u'uploader id', default=u'anonymous')
4416
4417         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
4418             webpage, u'thumbnail', fatal=False)
4419
4420         return [{
4421             'id':       video_id,
4422             'url':      video_url,
4423             'ext':      video_extension,
4424             'title':    video_title,
4425             # 'description': video_description,
4426             'upload_date': video_upload_date,
4427             'uploader_id': video_uploader_id,
4428             'thumbnail': video_thumbnail
4429         }]
4430
4431 class HypemIE(InfoExtractor):
4432     """Information Extractor for hypem"""
4433     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4434
4435     def _real_extract(self, url):
4436         mobj = re.match(self._VALID_URL, url)
4437         if mobj is None:
4438             raise ExtractorError(u'Invalid URL: %s' % url)
4439         track_id = mobj.group(1)
4440
4441         data = { 'ax': 1, 'ts': time.time() }
4442         data_encoded = compat_urllib_parse.urlencode(data)
4443         complete_url = url + "?" + data_encoded
4444         request = compat_urllib_request.Request(complete_url)
4445         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
4446         cookie = urlh.headers.get('Set-Cookie', '')
4447
4448         self.report_extraction(track_id)
4449
4450         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
4451             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
4452         try:
4453             track_list = json.loads(html_tracks)
4454             track = track_list[u'tracks'][0]
4455         except ValueError:
4456             raise ExtractorError(u'Hypemachine contained invalid JSON.')
4457
4458         key = track[u"key"]
4459         track_id = track[u"id"]
4460         artist = track[u"artist"]
4461         title = track[u"song"]
4462
4463         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4464         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4465         request.add_header('cookie', cookie)
4466         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4467         try:
4468             song_data = json.loads(song_data_json)
4469         except ValueError:
4470             raise ExtractorError(u'Hypemachine contained invalid JSON.')
4471         final_url = song_data[u"url"]
4472
4473         return [{
4474             'id':       track_id,
4475             'url':      final_url,
4476             'ext':      "mp3",
4477             'title':    title,
4478             'artist':   artist,
4479         }]
4480
4481 class Vbox7IE(InfoExtractor):
4482     """Information Extractor for Vbox7"""
4483     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
4484
4485     def _real_extract(self,url):
4486         mobj = re.match(self._VALID_URL, url)
4487         if mobj is None:
4488             raise ExtractorError(u'Invalid URL: %s' % url)
4489         video_id = mobj.group(1)
4490
4491         redirect_page, urlh = self._download_webpage_handle(url, video_id)
4492         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
4493         redirect_url = urlh.geturl() + new_location
4494         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
4495
4496         title = self._html_search_regex(r'<title>(.*)</title>',
4497             webpage, u'title').split('/')[0].strip()
4498
4499         ext = "flv"
4500         info_url = "http://vbox7.com/play/magare.do"
4501         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
4502         info_request = compat_urllib_request.Request(info_url, data)
4503         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
4504         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
4505         if info_response is None:
4506             raise ExtractorError(u'Unable to extract the media url')
4507         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
4508
4509         return [{
4510             'id':        video_id,
4511             'url':       final_url,
4512             'ext':       ext,
4513             'title':     title,
4514             'thumbnail': thumbnail_url,
4515         }]
4516
4517 class GametrailersIE(InfoExtractor):
4518     _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
4519
4520     def _real_extract(self, url):
4521         mobj = re.match(self._VALID_URL, url)
4522         if mobj is None:
4523             raise ExtractorError(u'Invalid URL: %s' % url)
4524         video_id = mobj.group('id')
4525         video_type = mobj.group('type')
4526         webpage = self._download_webpage(url, video_id)
4527         if video_type == 'full-episodes':
4528             mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
4529         else:
4530             mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
4531         mgid = self._search_regex(mgid_re, webpage, u'mgid')
4532         data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
4533
4534         info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
4535                                            video_id, u'Downloading video info')
4536         links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
4537                                                video_id, u'Downloading video urls info')
4538
4539         self.report_extraction(video_id)
4540         info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
4541                       <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
4542                       <image>.*
4543                         <url>(?P<thumb>.*?)</url>.*
4544                       </image>'''
4545
4546         m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
4547         if m_info is None:
4548             raise ExtractorError(u'Unable to extract video info')
4549         video_title = m_info.group('title')
4550         video_description = m_info.group('description')
4551         video_thumb = m_info.group('thumb')
4552
4553         m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
4554         if m_urls is None or len(m_urls) == 0:
4555             raise ExtractError(u'Unable to extrat video url')
4556         # They are sorted from worst to best quality
4557         video_url = m_urls[-1].group('url')
4558
4559         return {'url':         video_url,
4560                 'id':          video_id,
4561                 'title':       video_title,
4562                 # Videos are actually flv not mp4
4563                 'ext':         'flv',
4564                 'thumbnail':   video_thumb,
4565                 'description': video_description,
4566                 }
4567
4568 def gen_extractors():
4569     """ Return a list of an instance of every supported extractor.
4570     The order does matter; the first extractor matched is the one handling the URL.
4571     """
4572     return [
4573         YoutubePlaylistIE(),
4574         YoutubeChannelIE(),
4575         YoutubeUserIE(),
4576         YoutubeSearchIE(),
4577         YoutubeIE(),
4578         MetacafeIE(),
4579         DailymotionIE(),
4580         GoogleSearchIE(),
4581         PhotobucketIE(),
4582         YahooIE(),
4583         YahooSearchIE(),
4584         DepositFilesIE(),
4585         FacebookIE(),
4586         BlipTVIE(),
4587         BlipTVUserIE(),
4588         VimeoIE(),
4589         MyVideoIE(),
4590         ComedyCentralIE(),
4591         EscapistIE(),
4592         CollegeHumorIE(),
4593         XVideosIE(),
4594         SoundcloudSetIE(),
4595         SoundcloudIE(),
4596         InfoQIE(),
4597         MixcloudIE(),
4598         StanfordOpenClassroomIE(),
4599         MTVIE(),
4600         YoukuIE(),
4601         XNXXIE(),
4602         YouJizzIE(),
4603         PornotubeIE(),
4604         YouPornIE(),
4605         GooglePlusIE(),
4606         ArteTvIE(),
4607         NBAIE(),
4608         WorldStarHipHopIE(),
4609         JustinTVIE(),
4610         FunnyOrDieIE(),
4611         SteamIE(),
4612         UstreamIE(),
4613         RBMARadioIE(),
4614         EightTracksIE(),
4615         KeekIE(),
4616         TEDIE(),
4617         MySpassIE(),
4618         SpiegelIE(),
4619         LiveLeakIE(),
4620         ARDIE(),
4621         ZDFIE(),
4622         TumblrIE(),
4623         BandcampIE(),
4624         RedTubeIE(),
4625         InaIE(),
4626         HowcastIE(),
4627         VineIE(),
4628         FlickrIE(),
4629         TeamcocoIE(),
4630         XHamsterIE(),
4631         HypemIE(),
4632         Vbox7IE(),
4633         GametrailersIE(),
4634         GenericIE()
4635     ]
4636
4637 def get_info_extractor(ie_name):
4638     """Returns the info extractor class with the given ie_name"""
4639     return globals()[ie_name+'IE']