2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information about the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title, author and
32 others. The information is stored in a dictionary which is then
33 passed to the FileDownloader. The FileDownloader processes this
34 information possibly downloading the video to the file system, among
35 other possible outcomes.
37 The dictionaries must include the following fields:
41 title: Video title, unescaped.
42 ext: Video filename extension.
44 The following fields are optional:
46 format: The video format, defaults to ext (used for --get-format)
47 thumbnail: Full URL to a video thumbnail image.
48 description: One-line video description.
49 uploader: Full name of the video uploader.
50 upload_date: Video upload date (YYYYMMDD).
51 uploader_id: Nickname or id of the video uploader.
52 location: Physical location of the video.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The subtitle file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
75 def __init__(self, downloader=None):
76 """Constructor. Receives an optional downloader."""
78 self.set_downloader(downloader)
81 def suitable(cls, url):
82 """Receives a URL and returns True if suitable for this IE."""
83 return re.match(cls._VALID_URL, url) is not None
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
96 def extract(self, url):
97 """Extracts URL information and returns it in list of dicts."""
99 return self._real_extract(url)
101 def set_downloader(self, downloader):
102 """Sets the downloader for this IE."""
103 self._downloader = downloader
105 def _real_initialize(self):
106 """Real initialization process. Redefine in subclasses."""
109 def _real_extract(self, url):
110 """Real extraction process. Redefine in subclasses."""
115 return type(self).__name__[:-2]
117 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
118 """ Returns the response handle """
120 self.report_download_webpage(video_id)
121 elif note is not False:
122 self.to_screen(u'%s: %s' % (video_id, note))
124 return compat_urllib_request.urlopen(url_or_request)
125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
127 errnote = u'Unable to download webpage'
128 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
130 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
131 """ Returns a tuple (page content as string, URL handle) """
132 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
133 content_type = urlh.headers.get('Content-Type', '')
134 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
136 encoding = m.group(1)
139 webpage_bytes = urlh.read()
140 if self._downloader.params.get('dump_intermediate_pages', False):
142 url = url_or_request.get_full_url()
143 except AttributeError:
145 self.to_screen(u'Dumping request to ' + url)
146 dump = base64.b64encode(webpage_bytes).decode('ascii')
147 self._downloader.to_screen(dump)
148 content = webpage_bytes.decode(encoding, 'replace')
149 return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
155 def to_screen(self, msg):
156 """Print msg to screen, prefixing it with '[ie_name]'"""
157 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
159 def report_extraction(self, id_or_name):
160 """Report information extraction."""
161 self.to_screen(u'%s: Extracting information' % id_or_name)
163 def report_download_webpage(self, video_id):
164 """Report webpage download."""
165 self.to_screen(u'%s: Downloading webpage' % video_id)
167 def report_age_confirmation(self):
168 """Report attempt to confirm age."""
169 self.to_screen(u'Confirming age')
171 #Methods for following #608
172 #They set the correct value of the '_type' key
173 def video_result(self, video_info):
174 """Returns a video"""
175 video_info['_type'] = 'video'
177 def url_result(self, url, ie=None):
178 """Returns a url that points to a page that should be processed"""
179 #TODO: ie should be the class used for getting the info
180 video_info = {'_type': 'url',
184 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
185 """Returns a playlist"""
186 video_info = {'_type': 'playlist',
189 video_info['id'] = playlist_id
191 video_info['title'] = playlist_title
194 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
196 Perform a regex search on the given string, using a single or a list of
197 patterns returning the first matching group.
198 In case of failure return a default value or raise a WARNING or a
199 ExtractorError, depending on fatal, specifying the field name.
201 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
202 mobj = re.search(pattern, string, flags)
205 mobj = re.search(p, string, flags)
208 if sys.stderr.isatty() and os.name != 'nt':
209 _name = u'\033[0;34m%s\033[0m' % name
214 # return the first matching group
215 return next(g for g in mobj.groups() if g is not None)
216 elif default is not None:
219 raise ExtractorError(u'Unable to extract %s' % _name)
221 self._downloader.report_warning(u'unable to extract %s; '
222 u'please report this issue on GitHub.' % _name)
225 def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
227 Like _search_regex, but strips HTML tags and unescapes entities.
229 res = self._search_regex(pattern, string, name, default, fatal, flags)
231 return clean_html(res).strip()
235 class SearchInfoExtractor(InfoExtractor):
237 Base class for paged search queries extractors.
238 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
239 Instances should define _SEARCH_KEY and _MAX_RESULTS.
243 def _make_valid_url(cls):
244 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
247 def suitable(cls, url):
248 return re.match(cls._make_valid_url(), url) is not None
250 def _real_extract(self, query):
251 mobj = re.match(self._make_valid_url(), query)
253 raise ExtractorError(u'Invalid search query "%s"' % query)
255 prefix = mobj.group('prefix')
256 query = mobj.group('query')
258 return self._get_n_results(query, 1)
259 elif prefix == 'all':
260 return self._get_n_results(query, self._MAX_RESULTS)
264 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
265 elif n > self._MAX_RESULTS:
266 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
267 n = self._MAX_RESULTS
268 return self._get_n_results(query, n)
270 def _get_n_results(self, query, n):
271 """Get a specified number of results for a query"""
272 raise NotImplementedError("This method must be implemented by sublclasses")
275 class YoutubeIE(InfoExtractor):
276 """Information extractor for youtube.com."""
280 (?:https?://)? # http(s):// (optional)
281 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
282 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
283 (?:.*?\#/)? # handle anchor (#/) redirect urls
284 (?: # the various things that can precede the ID:
285 (?:(?:v|embed|e)/) # v/ or embed/ or e/
286 |(?: # or the v= param in all its forms
287 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
288 (?:\?|\#!?) # the params delimiter ? or # or #!
289 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
292 )? # optional -> youtube.com/xxxx is OK
293 )? # all until now is optional -> you can pass the naked ID
294 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
295 (?(1).+)? # if we found the ID, everything can follow
297 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
298 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
299 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
300 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
301 _NETRC_MACHINE = 'youtube'
302 # Listed in order of quality
303 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
304 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
305 _video_extensions = {
311 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
317 _video_dimensions = {
336 def suitable(cls, url):
337 """Receives a URL and returns True if suitable for this IE."""
338 if YoutubePlaylistIE.suitable(url): return False
339 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
341 def report_lang(self):
342 """Report attempt to set language."""
343 self.to_screen(u'Setting language')
345 def report_login(self):
346 """Report attempt to log in."""
347 self.to_screen(u'Logging in')
349 def report_video_webpage_download(self, video_id):
350 """Report attempt to download video webpage."""
351 self.to_screen(u'%s: Downloading video webpage' % video_id)
353 def report_video_info_webpage_download(self, video_id):
354 """Report attempt to download video info webpage."""
355 self.to_screen(u'%s: Downloading video info webpage' % video_id)
357 def report_video_subtitles_download(self, video_id):
358 """Report attempt to download video info webpage."""
359 self.to_screen(u'%s: Checking available subtitles' % video_id)
361 def report_video_subtitles_request(self, video_id, sub_lang, format):
362 """Report attempt to download video info webpage."""
363 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
365 def report_video_subtitles_available(self, video_id, sub_lang_list):
366 """Report available subtitles."""
367 sub_lang = ",".join(list(sub_lang_list.keys()))
368 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
370 def report_information_extraction(self, video_id):
371 """Report attempt to extract video information."""
372 self.to_screen(u'%s: Extracting video information' % video_id)
374 def report_unavailable_format(self, video_id, format):
375 """Report extracted video URL."""
376 self.to_screen(u'%s: Format %s not available' % (video_id, format))
378 def report_rtmp_download(self):
379 """Indicate the download will use the RTMP protocol."""
380 self.to_screen(u'RTMP download detected')
382 def _get_available_subtitles(self, video_id):
383 self.report_video_subtitles_download(video_id)
384 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
386 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
387 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
388 return (u'unable to download video subtitles: %s' % compat_str(err), None)
389 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
390 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
391 if not sub_lang_list:
392 return (u'video doesn\'t have subtitles', None)
395 def _list_available_subtitles(self, video_id):
396 sub_lang_list = self._get_available_subtitles(video_id)
397 self.report_video_subtitles_available(video_id, sub_lang_list)
399 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
402 (error_message, sub_lang, sub)
404 self.report_video_subtitles_request(video_id, sub_lang, format)
405 params = compat_urllib_parse.urlencode({
411 url = 'http://www.youtube.com/api/timedtext?' + params
413 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
417 return (u'Did not fetch video subtitles', None, None)
418 return (None, sub_lang, sub)
420 def _request_automatic_caption(self, video_id, webpage):
421 """We need the webpage for getting the captions url, pass it as an
422 argument to speed up the process."""
423 sub_lang = self._downloader.params.get('subtitleslang') or 'en'
424 sub_format = self._downloader.params.get('subtitlesformat')
425 self.to_screen(u'%s: Looking for automatic captions' % video_id)
426 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
427 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
429 return [(err_msg, None, None)]
430 player_config = json.loads(mobj.group(1))
432 args = player_config[u'args']
433 caption_url = args[u'ttsurl']
434 timestamp = args[u'timestamp']
435 params = compat_urllib_parse.urlencode({
442 subtitles_url = caption_url + '&' + params
443 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
444 return [(None, sub_lang, sub)]
446 return [(err_msg, None, None)]
448 def _extract_subtitle(self, video_id):
450 Return a list with a tuple:
451 [(error_message, sub_lang, sub)]
453 sub_lang_list = self._get_available_subtitles(video_id)
454 sub_format = self._downloader.params.get('subtitlesformat')
455 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
456 return [(sub_lang_list[0], None, None)]
457 if self._downloader.params.get('subtitleslang', False):
458 sub_lang = self._downloader.params.get('subtitleslang')
459 elif 'en' in sub_lang_list:
462 sub_lang = list(sub_lang_list.keys())[0]
463 if not sub_lang in sub_lang_list:
464 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
466 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
469 def _extract_all_subtitles(self, video_id):
470 sub_lang_list = self._get_available_subtitles(video_id)
471 sub_format = self._downloader.params.get('subtitlesformat')
472 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
473 return [(sub_lang_list[0], None, None)]
475 for sub_lang in sub_lang_list:
476 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
477 subtitles.append(subtitle)
480 def _print_formats(self, formats):
481 print('Available formats:')
483 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
485 def _real_initialize(self):
486 if self._downloader is None:
491 downloader_params = self._downloader.params
493 # Attempt to use provided username and password or .netrc data
494 if downloader_params.get('username', None) is not None:
495 username = downloader_params['username']
496 password = downloader_params['password']
497 elif downloader_params.get('usenetrc', False):
499 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
504 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
505 except (IOError, netrc.NetrcParseError) as err:
506 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
510 request = compat_urllib_request.Request(self._LANG_URL)
513 compat_urllib_request.urlopen(request).read()
514 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
515 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
518 # No authentication to be performed
522 request = compat_urllib_request.Request(self._LOGIN_URL)
524 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
525 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
526 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
531 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
533 galx = match.group(1)
535 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
541 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
545 u'PersistentCookie': u'yes',
547 u'bgresponse': u'js_disabled',
548 u'checkConnection': u'',
549 u'checkedDomains': u'youtube',
555 u'signIn': u'Sign in',
557 u'service': u'youtube',
561 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
563 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
564 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
565 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
568 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
569 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
570 self._downloader.report_warning(u'unable to log in: bad username or password')
572 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
573 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
579 'action_confirm': 'Confirm',
581 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
583 self.report_age_confirmation()
584 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
585 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
586 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
588 def _extract_id(self, url):
589 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
591 raise ExtractorError(u'Invalid URL: %s' % url)
592 video_id = mobj.group(2)
595 def _real_extract(self, url):
596 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
597 mobj = re.search(self._NEXT_URL_RE, url)
599 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
600 video_id = self._extract_id(url)
603 self.report_video_webpage_download(video_id)
604 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
605 request = compat_urllib_request.Request(url)
607 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
608 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
609 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
611 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
613 # Attempt to extract SWF player URL
614 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
616 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
621 self.report_video_info_webpage_download(video_id)
622 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
623 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
624 % (video_id, el_type))
625 video_info_webpage = self._download_webpage(video_info_url, video_id,
627 errnote='unable to download video info webpage')
628 video_info = compat_parse_qs(video_info_webpage)
629 if 'token' in video_info:
631 if 'token' not in video_info:
632 if 'reason' in video_info:
633 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
635 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
637 # Check for "rental" videos
638 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
639 raise ExtractorError(u'"rental" videos not supported')
641 # Start extracting information
642 self.report_information_extraction(video_id)
645 if 'author' not in video_info:
646 raise ExtractorError(u'Unable to extract uploader name')
647 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
650 video_uploader_id = None
651 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
653 video_uploader_id = mobj.group(1)
655 self._downloader.report_warning(u'unable to extract uploader nickname')
658 if 'title' not in video_info:
659 raise ExtractorError(u'Unable to extract video title')
660 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
663 if 'thumbnail_url' not in video_info:
664 self._downloader.report_warning(u'unable to extract video thumbnail')
666 else: # don't panic if we can't find it
667 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
671 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
673 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
674 upload_date = unified_strdate(upload_date)
677 video_description = get_element_by_id("eow-description", video_webpage)
678 if video_description:
679 video_description = clean_html(video_description)
681 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
683 video_description = unescapeHTML(fd_mobj.group(1))
685 video_description = u''
688 video_subtitles = None
690 if self._downloader.params.get('writesubtitles', False):
691 video_subtitles = self._extract_subtitle(video_id)
693 (sub_error, sub_lang, sub) = video_subtitles[0]
695 # We try with the automatic captions
696 video_subtitles = self._request_automatic_caption(video_id, video_webpage)
697 (sub_error_auto, sub_lang, sub) = video_subtitles[0]
701 # We report the original error
702 self._downloader.report_warning(sub_error)
704 if self._downloader.params.get('allsubtitles', False):
705 video_subtitles = self._extract_all_subtitles(video_id)
706 for video_subtitle in video_subtitles:
707 (sub_error, sub_lang, sub) = video_subtitle
709 self._downloader.report_warning(sub_error)
711 if self._downloader.params.get('listsubtitles', False):
712 sub_lang_list = self._list_available_subtitles(video_id)
715 if 'length_seconds' not in video_info:
716 self._downloader.report_warning(u'unable to extract video duration')
719 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
722 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
724 # Decide which formats to download
725 req_format = self._downloader.params.get('format', None)
727 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
728 self.report_rtmp_download()
729 video_url_list = [(None, video_info['conn'][0])]
730 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
732 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
733 url_data = compat_parse_qs(url_data_str)
734 if 'itag' in url_data and 'url' in url_data:
735 url = url_data['url'][0]
736 if 'sig' in url_data:
737 url += '&signature=' + url_data['sig'][0]
738 if 'ratebypass' not in url:
739 url += '&ratebypass=yes'
740 url_map[url_data['itag'][0]] = url
742 format_limit = self._downloader.params.get('format_limit', None)
743 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
744 if format_limit is not None and format_limit in available_formats:
745 format_list = available_formats[available_formats.index(format_limit):]
747 format_list = available_formats
748 existing_formats = [x for x in format_list if x in url_map]
749 if len(existing_formats) == 0:
750 raise ExtractorError(u'no known formats available for video')
751 if self._downloader.params.get('listformats', None):
752 self._print_formats(existing_formats)
754 if req_format is None or req_format == 'best':
755 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
756 elif req_format == 'worst':
757 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
758 elif req_format in ('-1', 'all'):
759 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
761 # Specific formats. We pick the first in a slash-delimited sequence.
762 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
763 req_formats = req_format.split('/')
764 video_url_list = None
765 for rf in req_formats:
767 video_url_list = [(rf, url_map[rf])]
769 if video_url_list is None:
770 raise ExtractorError(u'requested format not available')
772 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
775 for format_param, video_real_url in video_url_list:
777 video_extension = self._video_extensions.get(format_param, 'flv')
779 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
780 self._video_dimensions.get(format_param, '???'))
784 'url': video_real_url,
785 'uploader': video_uploader,
786 'uploader_id': video_uploader_id,
787 'upload_date': upload_date,
788 'title': video_title,
789 'ext': video_extension,
790 'format': video_format,
791 'thumbnail': video_thumbnail,
792 'description': video_description,
793 'player_url': player_url,
794 'subtitles': video_subtitles,
795 'duration': video_duration
800 class MetacafeIE(InfoExtractor):
801 """Information Extractor for metacafe.com."""
803 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
804 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
805 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
806 IE_NAME = u'metacafe'
808 def report_disclaimer(self):
809 """Report disclaimer retrieval."""
810 self.to_screen(u'Retrieving disclaimer')
812 def _real_initialize(self):
813 # Retrieve disclaimer
814 request = compat_urllib_request.Request(self._DISCLAIMER)
816 self.report_disclaimer()
817 disclaimer = compat_urllib_request.urlopen(request).read()
818 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
819 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
824 'submit': "Continue - I'm over 18",
826 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
828 self.report_age_confirmation()
829 disclaimer = compat_urllib_request.urlopen(request).read()
830 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
831 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
833 def _real_extract(self, url):
834 # Extract id and simplified title from URL
835 mobj = re.match(self._VALID_URL, url)
837 raise ExtractorError(u'Invalid URL: %s' % url)
839 video_id = mobj.group(1)
841 # Check if video comes from YouTube
842 mobj2 = re.match(r'^yt-(.*)$', video_id)
843 if mobj2 is not None:
844 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
846 # Retrieve video webpage to extract further information
847 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
849 # Extract URL, uploader and title from webpage
850 self.report_extraction(video_id)
851 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
853 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
854 video_extension = mediaURL[-3:]
856 # Extract gdaKey if available
857 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
861 gdaKey = mobj.group(1)
862 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
864 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
866 raise ExtractorError(u'Unable to extract media URL')
867 vardict = compat_parse_qs(mobj.group(1))
868 if 'mediaData' not in vardict:
869 raise ExtractorError(u'Unable to extract media URL')
870 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
872 raise ExtractorError(u'Unable to extract media URL')
873 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
874 video_extension = mediaURL[-3:]
875 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
877 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
879 raise ExtractorError(u'Unable to extract title')
880 video_title = mobj.group(1).decode('utf-8')
882 mobj = re.search(r'submitter=(.*?);', webpage)
884 raise ExtractorError(u'Unable to extract uploader nickname')
885 video_uploader = mobj.group(1)
888 'id': video_id.decode('utf-8'),
889 'url': video_url.decode('utf-8'),
890 'uploader': video_uploader.decode('utf-8'),
892 'title': video_title,
893 'ext': video_extension.decode('utf-8'),
896 class DailymotionIE(InfoExtractor):
897 """Information Extractor for Dailymotion"""
899 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
900 IE_NAME = u'dailymotion'
902 def _real_extract(self, url):
903 # Extract id and simplified title from URL
904 mobj = re.match(self._VALID_URL, url)
906 raise ExtractorError(u'Invalid URL: %s' % url)
908 video_id = mobj.group(1).split('_')[0].split('?')[0]
910 video_extension = 'mp4'
912 # Retrieve video webpage to extract further information
913 request = compat_urllib_request.Request(url)
914 request.add_header('Cookie', 'family_filter=off')
915 webpage = self._download_webpage(request, video_id)
917 # Extract URL, uploader and title from webpage
918 self.report_extraction(video_id)
919 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
921 raise ExtractorError(u'Unable to extract media URL')
922 flashvars = compat_urllib_parse.unquote(mobj.group(1))
924 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
927 self.to_screen(u'Using %s' % key)
930 raise ExtractorError(u'Unable to extract video URL')
932 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
934 raise ExtractorError(u'Unable to extract video URL')
936 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
938 # TODO: support choosing qualities
940 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
942 raise ExtractorError(u'Unable to extract title')
943 video_title = unescapeHTML(mobj.group('title'))
945 video_uploader = None
946 video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
947 # Looking for official user
948 r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
949 webpage, 'video uploader')
951 video_upload_date = None
952 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
954 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
959 'uploader': video_uploader,
960 'upload_date': video_upload_date,
961 'title': video_title,
962 'ext': video_extension,
966 class PhotobucketIE(InfoExtractor):
967 """Information extractor for photobucket.com."""
969 # TODO: the original _VALID_URL was:
970 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
971 # Check if it's necessary to keep the old extraction process
972 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
973 IE_NAME = u'photobucket'
975 def _real_extract(self, url):
976 # Extract id from URL
977 mobj = re.match(self._VALID_URL, url)
979 raise ExtractorError(u'Invalid URL: %s' % url)
981 video_id = mobj.group('id')
983 video_extension = mobj.group('ext')
985 # Retrieve video webpage to extract further information
986 webpage = self._download_webpage(url, video_id)
988 # Extract URL, uploader, and title from webpage
989 self.report_extraction(video_id)
990 # We try first by looking the javascript code:
991 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
993 info = json.loads(mobj.group('json'))
996 'url': info[u'downloadUrl'],
997 'uploader': info[u'username'],
998 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
999 'title': info[u'title'],
1000 'ext': video_extension,
1001 'thumbnail': info[u'thumbUrl'],
1004 # We try looking in other parts of the webpage
1005 video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1006 webpage, u'video URL')
1008 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1010 raise ExtractorError(u'Unable to extract title')
1011 video_title = mobj.group(1).decode('utf-8')
1012 video_uploader = mobj.group(2).decode('utf-8')
1015 'id': video_id.decode('utf-8'),
1016 'url': video_url.decode('utf-8'),
1017 'uploader': video_uploader,
1018 'upload_date': None,
1019 'title': video_title,
1020 'ext': video_extension.decode('utf-8'),
1024 class YahooIE(InfoExtractor):
1025 """Information extractor for screen.yahoo.com."""
# Two extraction paths, selected on whether the page defines a YUI Media
# CONTENT_ID: without one, the numeric id from the URL is resolved through
# the cosmos.bcst.yahoo.com MRSS endpoints (regex-parsed XML); with one, the
# long id goes through the public YQL endpoint and host/path are read from
# the first stream of the first mediaObj in the JSON reply.
# NOTE(review): elided chunk — the `if` guards before the raises and the
# `if m_id is None:` branch header are among the missing numbered lines.
1026 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1028 def _real_extract(self, url):
1029 mobj = re.match(self._VALID_URL, url)
1031 raise ExtractorError(u'Invalid URL: %s' % url)
1032 video_id = mobj.group('id')
1033 webpage = self._download_webpage(url, video_id)
1034 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1037 # TODO: Check which url parameters are required
1038 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1039 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
# Verbose regex over the MRSS XML; escaped spaces because of re.VERBOSE.
1040 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1041 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1042 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1043 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1045 self.report_extraction(video_id)
1046 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1048 raise ExtractorError(u'Unable to extract video info')
1049 video_title = m_info.group('title')
1050 video_description = m_info.group('description')
1051 video_thumb = m_info.group('thumb')
1052 video_date = m_info.group('date')
# Date arrives as MM/DD/YYYY; normalized to the YYYYMMDD upload_date format.
1053 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1055 # TODO: Find a way to get mp4 videos
1056 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1057 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1058 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1059 video_url = m_rest.group('url')
1060 video_path = m_rest.group('path')
1062 raise ExtractorError(u'Unable to extract video url')
1064 else: # We have to use a different method if another id is defined
1065 long_id = m_id.group('new_id')
1066 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1067 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
# Strip the JSONP callback wrapper before parsing as JSON.
1068 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1069 info = json.loads(json_str)
1070 res = info[u'query'][u'results'][u'mediaObj'][0]
1071 stream = res[u'streams'][0]
1072 video_path = stream[u'path']
1073 video_url = stream[u'host']
# NOTE(review): `meta` is assigned on an elided line (presumably from `res`)
# — confirm against the full source.
1075 video_title = meta[u'title']
1076 video_description = meta[u'description']
1077 video_thumb = meta[u'thumbnail']
1078 video_date = None # I can't find it
1083 'play_path': video_path,
1084 'title':video_title,
1085 'description': video_description,
1086 'thumbnail': video_thumb,
1087 'upload_date': video_date,
1092 class VimeoIE(InfoExtractor):
1093 """Information extractor for vimeo.com."""
# Parses the JSON config embedded in the player page (isolated by splitting
# on ' = {config:' and ',assets:'), then selects a file by quality
# ('hd' > 'sd' > 'other') across the known codecs and builds a signed
# play_redirect URL from the config's request signature and timestamp.
# NOTE(review): elided chunk — guard lines (`if mobj is None:`), the
# try/except around the config parse, the `else:` of the codec loop and the
# final `return` are among the missing numbered lines.
1095 # _VALID_URL matches Vimeo URLs
1096 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1099 def _real_extract(self, url, new_video=True):
1100 # Extract ID from URL
1101 mobj = re.match(self._VALID_URL, url)
1103 raise ExtractorError(u'Invalid URL: %s' % url)
1105 video_id = mobj.group('id')
# Normalize the URL: force https, and send direct-link/vimeopro URLs to the
# canonical vimeo.com video page.
1106 if not mobj.group('proto'):
1107 url = 'https://' + url
1108 if mobj.group('direct_link') or mobj.group('pro'):
1109 url = 'https://vimeo.com/' + video_id
1111 # Retrieve video webpage to extract further information
1112 request = compat_urllib_request.Request(url, None, std_headers)
1113 webpage = self._download_webpage(request, video_id)
1115 # Now we begin extracting as much information as we can from what we
1116 # retrieved. First we extract the information common to all extractors,
1117 # and latter we extract those that are Vimeo specific.
1118 self.report_extraction(video_id)
1120 # Extract the config JSON
1122 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1123 config = json.loads(config)
# If the config cannot be found, a domain-embed restriction is the most
# useful error to surface; otherwise report a generic parse failure.
1125 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1126 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1128 raise ExtractorError(u'Unable to extract info section')
1131 video_title = config["video"]["title"]
1133 # Extract uploader and uploader_id
1134 video_uploader = config["video"]["owner"]["name"]
1135 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1137 # Extract video thumbnail
1138 video_thumbnail = config["video"]["thumbnail"]
1140 # Extract video description
1141 video_description = get_element_by_attribute("itemprop", "description", webpage)
1142 if video_description: video_description = clean_html(video_description)
1143 else: video_description = u''
1145 # Extract upload date
1146 video_upload_date = None
1147 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1148 if mobj is not None:
1149 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1151 # Vimeo specific: extract request signature and timestamp
1152 sig = config['request']['signature']
1153 timestamp = config['request']['timestamp']
1155 # Vimeo specific: extract video codec and quality information
1156 # First consider quality, then codecs, then take everything
1157 # TODO bind to format param
1158 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1159 files = { 'hd': [], 'sd': [], 'other': []}
1160 for codec_name, codec_extension in codecs:
1161 if codec_name in config["video"]["files"]:
1162 if 'hd' in config["video"]["files"][codec_name]:
1163 files['hd'].append((codec_name, codec_extension, 'hd'))
1164 elif 'sd' in config["video"]["files"][codec_name]:
1165 files['sd'].append((codec_name, codec_extension, 'sd'))
1167 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# First non-empty quality bucket wins; codec order above fixes the
# tie-break within a bucket.
1169 for quality in ('hd', 'sd', 'other'):
1170 if len(files[quality]) > 0:
1171 video_quality = files[quality][0][2]
1172 video_codec = files[quality][0][0]
1173 video_extension = files[quality][0][1]
1174 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1177 raise ExtractorError(u'No known codec found')
1179 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1180 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1185 'uploader': video_uploader,
1186 'uploader_id': video_uploader_id,
1187 'upload_date': video_upload_date,
1188 'title': video_title,
1189 'ext': video_extension,
1190 'thumbnail': video_thumbnail,
1191 'description': video_description,
1195 class ArteTvIE(InfoExtractor):
1196 """arte.tv information extractor."""
# Dispatches on the URL shape: live-stream pages (matching _LIVE_URL) go
# through extractLiveStream(); everything else through extractPlus7Stream().
# Both helpers chain grep_webpage() calls, each of which fetches a page,
# applies one regex, and maps the listed groups into an info dict, raising
# the per-group error message when a group is empty.
# NOTE(review): elided chunk — `try:` headers, `return` statements, the
# flags/matchTuples arguments of several grep_webpage calls and some guard
# lines are among the missing numbered lines.
1198 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1199 _LIVE_URL = r'index-[0-9]+\.html$'
1201 IE_NAME = u'arte.tv'
1203 def fetch_webpage(self, url):
# Raw page fetch; network errors are wrapped into ExtractorError.
1204 request = compat_urllib_request.Request(url)
1206 self.report_download_webpage(url)
1207 webpage = compat_urllib_request.urlopen(request).read()
1208 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1209 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1210 except ValueError as err:
1211 raise ExtractorError(u'Invalid URL: %s' % url)
1214 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch url, search regex, and collect matchTuples (group-index, key,
# error-message) entries into a dict keyed by `key`.
1215 page = self.fetch_webpage(url)
1216 mobj = re.search(regex, page, regexFlags)
1220 raise ExtractorError(u'Invalid URL: %s' % url)
1222 for (i, key, err) in matchTuples:
1223 if mobj.group(i) is None:
1224 raise ExtractorError(err)
1226 info[key] = mobj.group(i)
1230 def extractLiveStream(self, url):
# Language code is positional in the URL path for live pages.
1231 video_lang = url.split('/')[-4]
1232 info = self.grep_webpage(
1234 r'src="(.*?/videothek_js.*?\.js)',
1237 (1, 'url', u'Invalid URL: %s' % url)
1240 http_host = url.split('/')[2]
1241 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1242 info = self.grep_webpage(
1244 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1245 '(http://.*?\.swf).*?' +
1249 (1, 'path', u'could not extract video path: %s' % url),
1250 (2, 'player', u'could not extract video player: %s' % url),
1251 (3, 'url', u'could not extract video url: %s' % url)
1254 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1256 def extractPlus7Stream(self, url):
# Plus7 pages use a different path layout for the language code.
1257 video_lang = url.split('/')[-3]
1258 info = self.grep_webpage(
1260 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1263 (1, 'url', u'Invalid URL: %s' % url)
1266 next_url = compat_urllib_parse.unquote(info.get('url'))
1267 info = self.grep_webpage(
1269 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1272 (1, 'url', u'Could not find <video> tag: %s' % url)
1275 next_url = compat_urllib_parse.unquote(info.get('url'))
1277 info = self.grep_webpage(
1279 r'<video id="(.*?)".*?>.*?' +
1280 '<name>(.*?)</name>.*?' +
1281 '<dateVideo>(.*?)</dateVideo>.*?' +
1282 '<url quality="hd">(.*?)</url>',
1285 (1, 'id', u'could not extract video id: %s' % url),
1286 (2, 'title', u'could not extract video title: %s' % url),
1287 (3, 'date', u'could not extract video date: %s' % url),
1288 (4, 'url', u'could not extract video url: %s' % url)
1293 'id': info.get('id'),
1294 'url': compat_urllib_parse.unquote(info.get('url')),
1295 'uploader': u'arte.tv',
1296 'upload_date': unified_strdate(info.get('date')),
1297 'title': info.get('title').decode('utf-8'),
1303 def _real_extract(self, url):
1304 video_id = url.split('/')[-1]
1305 self.report_extraction(video_id)
1307 if re.search(self._LIVE_URL, video_id) is not None:
1308 self.extractLiveStream(url)
1311 info = self.extractPlus7Stream(url)
1316 class GenericIE(InfoExtractor):
1317 """Generic last-resort information extractor."""
# Strategy: first resolve URL-shortener style redirects with a HEAD-only
# opener (falling back to GET on 405); then scan the final page with a
# cascade of progressively broader regexes (JW Player flashvars, generic
# file=/source= parameters, JWPlayer JS loader, twitter:player:stream,
# Open Graph og:video when og:video:type says it is a real video).
# NOTE(review): elided chunk — several `if mobj is None:` fall-through
# lines between the regex attempts, the `class HeadRequest` body's return,
# and some handler plumbing lines are missing from view.
1320 IE_NAME = u'generic'
1322 def report_download_webpage(self, video_id):
1323 """Report webpage download."""
# Warn that we fell back to the generic extractor (suppressed in tests).
1324 if not self._downloader.params.get('test', False):
1325 self._downloader.report_warning(u'Falling back on generic information extractor.')
1326 super(GenericIE, self).report_download_webpage(video_id)
1328 def report_following_redirect(self, new_url):
1329 """Report information extraction."""
1330 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1332 def _test_redirect(self, url):
1333 """Check if it is a redirect, like url shorteners, in case return the new url."""
1334 class HeadRequest(compat_urllib_request.Request):
# Force the HTTP method to HEAD so only headers are fetched.
1335 def get_method(self):
1338 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1340 Subclass the HTTPRedirectHandler to make it use our
1341 HeadRequest also on the redirected URL
1343 def redirect_request(self, req, fp, code, msg, headers, newurl):
1344 if code in (301, 302, 303, 307):
1345 newurl = newurl.replace(' ', '%20')
# Per RFC redirect semantics: drop body-related headers on redirect.
1346 newheaders = dict((k,v) for k,v in req.headers.items()
1347 if k.lower() not in ("content-length", "content-type"))
1348 return HeadRequest(newurl,
1350 origin_req_host=req.get_origin_req_host(),
1353 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1355 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1357 Fallback to GET if HEAD is not allowed (405 HTTP error)
1359 def http_error_405(self, req, fp, code, msg, headers):
1363 newheaders = dict((k,v) for k,v in req.headers.items()
1364 if k.lower() not in ("content-length", "content-type"))
1365 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1367 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with only the handlers needed for the HEAD probe.
1371 opener = compat_urllib_request.OpenerDirector()
1372 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1373 HTTPMethodFallback, HEADRedirectHandler,
1374 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1375 opener.add_handler(handler())
1377 response = opener.open(HeadRequest(url))
1378 if response is None:
1379 raise ExtractorError(u'Invalid URL protocol')
1380 new_url = response.geturl()
1385 self.report_following_redirect(new_url)
1388 def _real_extract(self, url):
1389 new_url = self._test_redirect(url)
1390 if new_url: return [self.url_result(new_url)]
1392 video_id = url.split('/')[-1]
1394 webpage = self._download_webpage(url, video_id)
1395 except ValueError as err:
1396 # since this is the last-resort InfoExtractor, if
1397 # this error is thrown, it'll be thrown here
1398 raise ExtractorError(u'Invalid URL: %s' % url)
1400 self.report_extraction(video_id)
1401 # Start with something easy: JW Player in SWFObject
1402 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1404 # Broaden the search a little bit
1405 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1407 # Broaden the search a little bit: JWPlayer JS loader
1408 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1410 # Try to find twitter cards info
1411 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1413 # We look for Open Graph info:
1414 # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
1415 m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1416 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1417 if m_video_type is not None:
1418 mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
1420 raise ExtractorError(u'Invalid URL: %s' % url)
1422 # It's possible that one of the regexes
1423 # matched, but returned an empty group:
1424 if mobj.group(1) is None:
1425 raise ExtractorError(u'Invalid URL: %s' % url)
1427 video_url = compat_urllib_parse.unquote(mobj.group(1))
1428 video_id = os.path.basename(video_url)
1430 # here's a fun little line of code for you:
1431 video_extension = os.path.splitext(video_id)[1][1:]
1432 video_id = os.path.splitext(video_id)[0]
1434 # it's tempting to parse this further, but you would
1435 # have to take into account all the variations like
1436 # Video Title - Site Name
1437 # Site Name | Video Title
1438 # Video Title - Tagline | Site Name
1439 # and so on and so forth; it's just not practical
1440 video_title = self._html_search_regex(r'<title>(.*)</title>',
1441 webpage, u'video title')
1443 # video uploader is domain name
1444 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1445 url, u'video uploader')
1450 'uploader': video_uploader,
1451 'upload_date': None,
1452 'title': video_title,
1453 'ext': video_extension,
1457 class YoutubeSearchIE(SearchInfoExtractor):
1458 """Information Extractor for YouTube search queries."""
# Pages through the gdata v2 JSON-C search API, 50 results per request,
# accumulating video ids until n results (or the API's totalItems cap) are
# collected, then returns a playlist of youtube.com watch URLs.
# NOTE(review): elided chunk — initialisation of the paging state
# (video_ids, pagenum, limit) before the while loop is missing from view.
1459 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1461 IE_NAME = u'youtube:search'
1462 _SEARCH_KEY = 'ytsearch'
1464 def report_download_page(self, query, pagenum):
1465 """Report attempt to download search page with given number."""
1466 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1468 def _get_n_results(self, query, n):
1469 """Get a specified number of results for a query"""
1475 while (50 * pagenum) < limit:
1476 self.report_download_page(query, pagenum+1)
# start-index is 1-based in the gdata API, hence the +1.
1477 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1478 request = compat_urllib_request.Request(result_url)
1480 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1481 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1482 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1483 api_response = json.loads(data)['data']
1485 if not 'items' in api_response:
1486 raise ExtractorError(u'[youtube] No video results')
1488 new_ids = list(video['id'] for video in api_response['items'])
1489 video_ids += new_ids
# Clamp the target to what the API says actually exists.
1491 limit = min(n, api_response['totalItems'])
1494 if len(video_ids) > n:
1495 video_ids = video_ids[:n]
1496 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1497 return self.playlist_result(videos, query)
1500 class GoogleSearchIE(SearchInfoExtractor):
1501 """Information Extractor for Google Video search queries."""
# Scrapes google.com video-search result pages (10 hits per page), yielding
# each result link as a playlist entry until n entries are gathered or the
# "next page" marker disappears.
# NOTE(review): elided chunk — parts of the `res` dict literal and the
# per-entry dict are missing from view.
1502 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1504 IE_NAME = u'video.google:search'
1505 _SEARCH_KEY = 'gvsearch'
1507 def _get_n_results(self, query, n):
1508 """Get a specified number of results for a query"""
1511 '_type': 'playlist',
1516 for pagenum in itertools.count(1):
1517 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1518 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1519 note='Downloading result page ' + str(pagenum))
1521 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1524 'url': mobj.group(1)
1526 res['entries'].append(e)
# Stop on result-count reached or when no further pages are advertised.
1528 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1531 class YahooSearchIE(SearchInfoExtractor):
1532 """Information Extractor for Yahoo! Video search queries."""
# Queries the video.search.yahoo.com JSON endpoint 30 results at a time and
# turns each result's screen.yahoo.com link into a playlist entry.
# NOTE(review): elided chunk — the `res` dict literal is partly missing, and
# `m` at the final loop-exit test is assigned on an elided line (presumably
# pagination metadata from `info`) — confirm against the full source.
1535 IE_NAME = u'screen.yahoo:search'
1536 _SEARCH_KEY = 'yvsearch'
1538 def _get_n_results(self, query, n):
1539 """Get a specified number of results for a query"""
1542 '_type': 'playlist',
1546 for pagenum in itertools.count(0):
1547 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1548 webpage = self._download_webpage(result_url, query,
1549 note='Downloading results page '+str(pagenum+1))
1550 info = json.loads(webpage)
1552 results = info[u'results']
1554 for (i, r) in enumerate(results):
# Stop once n entries have been collected across pages.
1555 if (pagenum * 30) +i >= n:
1557 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1558 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1559 res['entries'].append(e)
1560 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1566 class YoutubePlaylistIE(InfoExtractor):
"""Information Extractor for YouTube playlists."""
# Pages through the gdata v2 playlist feed (_TEMPLATE_URL), collecting
# (position, watch-URL) pairs so entries can be re-sorted into playlist
# order before being returned as url_results.
# NOTE(review): elided chunk — the verbose-regex middle of _VALID_URL, the
# paging loop header and the videos/page_num initialisation are missing
# from view.
1569 _VALID_URL = r"""(?:
1574 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1575 \? (?:.*?&)*? (?:p|a|list)=
1578 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1581 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1583 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1585 IE_NAME = u'youtube:playlist'
1588 def suitable(cls, url):
1589 """Receives a URL and returns True if suitable for this IE."""
# re.VERBOSE is required because _VALID_URL is written as a verbose regex.
1590 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1592 def _real_extract(self, url):
1593 # Extract playlist id
1594 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1596 raise ExtractorError(u'Invalid URL: %s' % url)
1598 # Download playlist videos from API
1599 playlist_id = mobj.group(1) or mobj.group(2)
1604 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1605 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1608 response = json.loads(page)
1609 except ValueError as err:
1610 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1612 if 'feed' not in response:
1613 raise ExtractorError(u'Got a malformed response from YouTube API')
1614 playlist_title = response['feed']['title']['$t']
1615 if 'entry' not in response['feed']:
1616 # Number of videos is a multiple of self._MAX_RESULTS
1619 for entry in response['feed']['entry']:
1620 index = entry['yt$position']['$t']
1621 if 'media$group' in entry and 'media$player' in entry['media$group']:
1622 videos.append((index, entry['media$group']['media$player']['url']))
# A short page means this was the last one.
1624 if len(response['feed']['entry']) < self._MAX_RESULTS:
1628 videos = [v[1] for v in sorted(videos)]
1630 url_results = [self.url_result(url, 'Youtube') for url in videos]
1631 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1634 class YoutubeChannelIE(InfoExtractor):
1635 """Information Extractor for YouTube channels."""
# Downloads the channel's first HTML listing page, then keeps fetching the
# json-based channel_ajax endpoint while the "load more" marker is present,
# scraping watch?v= ids from each chunk of HTML.
# NOTE(review): elided chunk — `ids_in_page = []` / `return ids_in_page` in
# extract_videos_from_page, the paging-state initialisation, and the
# while-loop header of the ajax paging are missing from view.
1637 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1638 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1639 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1640 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1641 IE_NAME = u'youtube:channel'
1643 def extract_videos_from_page(self, page):
# Collect unique video ids, preserving first-seen order.
1645 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1646 if mobj.group(1) not in ids_in_page:
1647 ids_in_page.append(mobj.group(1))
1650 def _real_extract(self, url):
1651 # Extract channel id
1652 mobj = re.match(self._VALID_URL, url)
1654 raise ExtractorError(u'Invalid URL: %s' % url)
1656 # Download channel page
1657 channel_id = mobj.group(1)
1661 url = self._TEMPLATE_URL % (channel_id, pagenum)
1662 page = self._download_webpage(url, channel_id,
1663 u'Downloading page #%s' % pagenum)
1665 # Extract video identifiers
1666 ids_in_page = self.extract_videos_from_page(page)
1667 video_ids.extend(ids_in_page)
1669 # Download any subsequent channel pages using the json-based channel_ajax query
1670 if self._MORE_PAGES_INDICATOR in page:
1672 pagenum = pagenum + 1
1674 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1675 page = self._download_webpage(url, channel_id,
1676 u'Downloading page #%s' % pagenum)
1678 page = json.loads(page)
# The ajax reply wraps the listing HTML under 'content_html'.
1680 ids_in_page = self.extract_videos_from_page(page['content_html'])
1681 video_ids.extend(ids_in_page)
1683 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1686 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1688 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1689 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1690 return [self.playlist_result(url_entries, channel_id)]
1693 class YoutubeUserIE(InfoExtractor):
1694 """Information Extractor for YouTube users."""
# Pages through the gdata uploads feed _GDATA_PAGE_SIZE videos at a time,
# scraping ids with _VIDEO_INDICATOR, until a short page signals the end.
# NOTE(review): elided chunk — the paging loop header and the
# video_ids/pagenum/ids_in_page initialisations are missing from view.
1696 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1697 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1698 _GDATA_PAGE_SIZE = 50
1699 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1700 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1701 IE_NAME = u'youtube:user'
1703 def _real_extract(self, url):
1705 mobj = re.match(self._VALID_URL, url)
1707 raise ExtractorError(u'Invalid URL: %s' % url)
1709 username = mobj.group(1)
1711 # Download video ids using YouTube Data API. Result size per
1712 # query is limited (currently to 50 videos) so we need to query
1713 # page by page until there are no video ids - it means we got
# gdata start-index is 1-based.
1720 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1722 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1723 page = self._download_webpage(gdata_url, username,
1724 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1726 # Extract video identifiers
1729 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1730 if mobj.group(1) not in ids_in_page:
1731 ids_in_page.append(mobj.group(1))
1733 video_ids.extend(ids_in_page)
1735 # A little optimization - if current page is not
1736 # "full", ie. does not contain PAGE_SIZE video ids then
1737 # we can assume that this page is the last one - there
1738 # are no more ids on further pages - no need to query
1741 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1746 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1747 url_results = [self.url_result(url, 'Youtube') for url in urls]
1748 return [self.playlist_result(url_results, playlist_title = username)]
1751 class BlipTVUserIE(InfoExtractor):
1752 """Information Extractor for blip.tv users."""
# Resolves the numeric users_id from the user's mobile page, then pages
# through the show_get_full_episode_list ajax endpoint until a short page
# (fewer than _PAGE_SIZE ids) signals the end.
# NOTE(review): elided chunk — the paging loop header, `_PAGE_SIZE`, and
# the video_ids/pagenum/ids_in_page initialisations are missing from view.
1754 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1756 IE_NAME = u'blip.tv:user'
1758 def _real_extract(self, url):
1760 mobj = re.match(self._VALID_URL, url)
1762 raise ExtractorError(u'Invalid URL: %s' % url)
1764 username = mobj.group(1)
1766 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1768 page = self._download_webpage(url, username, u'Downloading user page')
1769 mobj = re.search(r'data-users-id="([^"]+)"', page)
1770 page_base = page_base % mobj.group(1)
1773 # Download video ids using BlipTV Ajax calls. Result size per
1774 # query is limited (currently to 12 videos) so we need to query
1775 # page by page until there are no video ids - it means we got
1782 url = page_base + "&page=" + str(pagenum)
1783 page = self._download_webpage(url, username,
1784 u'Downloading video ids from page %d' % pagenum)
1786 # Extract video identifiers
1789 for mobj in re.finditer(r'href="/([^"]+)"', page):
1790 if mobj.group(1) not in ids_in_page:
1791 ids_in_page.append(unescapeHTML(mobj.group(1)))
1793 video_ids.extend(ids_in_page)
1795 # A little optimization - if current page is not
1796 # "full", ie. does not contain PAGE_SIZE video ids then
1797 # we can assume that this page is the last one - there
1798 # are no more ids on further pages - no need to query
1801 if len(ids_in_page) < self._PAGE_SIZE:
1806 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1807 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1808 return [self.playlist_result(url_entries, playlist_title = username)]
1811 class DepositFilesIE(InfoExtractor):
1812 """Information extractor for depositfiles.com"""
# POSTs the "Free download" form to the English-locale page, then scrapes
# the real fileshare URL; on failure it tries to surface the site's own
# restriction message before giving a generic error.
# NOTE(review): elided chunk — the `try:` header before the urlopen and the
# result-dict wrapper lines are missing from view; the .decode('utf-8')
# calls imply Python 2-era byte strings.
1814 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1816 def _real_extract(self, url):
1817 file_id = url.split('/')[-1]
1818 # Rebuild url in english locale
1819 url = 'http://depositfiles.com/en/files/' + file_id
1821 # Retrieve file webpage with 'Free download' button pressed
1822 free_download_indication = { 'gateway_result' : '1' }
1823 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1825 self.report_download_webpage(file_id)
1826 webpage = compat_urllib_request.urlopen(request).read()
1827 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1828 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1830 # Search for the real file URL
1831 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1832 if (mobj is None) or (mobj.group(1) is None):
1833 # Try to figure out reason of the error.
1834 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1835 if (mobj is not None) and (mobj.group(1) is not None):
1836 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1837 raise ExtractorError(u'%s' % restriction_message)
1839 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1841 file_url = mobj.group(1)
1842 file_extension = os.path.splitext(file_url)[1][1:]
1844 # Search for file title
1845 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
1848 'id': file_id.decode('utf-8'),
1849 'url': file_url.decode('utf-8'),
1851 'upload_date': None,
1852 'title': file_title,
1853 'ext': file_extension.decode('utf-8'),
1857 class FacebookIE(InfoExtractor):
1858 """Information Extractor for Facebook"""
# Logs in (credentials from --username/--password or .netrc) during
# _real_initialize, then extracts the video by locating the swf parameter
# blob between the BEFORE/AFTER markers, URL-unquoting its 'params' entry,
# and reading hd_src (falling back to sd_src) from the first video_data.
# NOTE(review): elided chunk — the login_form construction, several early
# returns, and the result-dict wrapper lines are missing from view.
1860 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1861 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1862 _NETRC_MACHINE = 'facebook'
1863 IE_NAME = u'facebook'
1865 def report_login(self):
1866 """Report attempt to log in."""
1867 self.to_screen(u'Logging in')
1869 def _real_initialize(self):
1870 if self._downloader is None:
1875 downloader_params = self._downloader.params
1877 # Attempt to use provided username and password or .netrc data
1878 if downloader_params.get('username', None) is not None:
1879 useremail = downloader_params['username']
1880 password = downloader_params['password']
1881 elif downloader_params.get('usenetrc', False):
1883 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1884 if info is not None:
1888 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1889 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems are non-fatal: warn and proceed without credentials.
1890 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1893 if useremail is None:
1902 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1905 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> in the response means authentication did not succeed.
1906 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1907 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1909 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1910 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1913 def _real_extract(self, url):
1914 mobj = re.match(self._VALID_URL, url)
1916 raise ExtractorError(u'Invalid URL: %s' % url)
1917 video_id = mobj.group('ID')
1919 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1920 webpage = self._download_webpage(url, video_id)
# The swf variable blob sits between these two literal JS fragments.
1922 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1923 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1924 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1926 raise ExtractorError(u'Cannot parse data')
1927 data = dict(json.loads(m.group(1)))
1928 params_raw = compat_urllib_parse.unquote(data['params'])
1929 params = json.loads(params_raw)
1930 video_data = params['video_data'][0]
# Prefer the HD source; fall back to SD.
1931 video_url = video_data.get('hd_src')
1933 video_url = video_data['sd_src']
1935 raise ExtractorError(u'Cannot find video URL')
1936 video_duration = int(video_data['video_duration'])
1937 thumbnail = video_data['thumbnail_src']
1939 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1944 'title': video_title,
1947 'duration': video_duration,
1948 'thumbnail': thumbnail,
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
1953 class BlipTVIE(InfoExtractor):
1954 """Information extractor for blip.tv"""
1956 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Regex used to pull the filename extension out of a media URL.
1957 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1958 IE_NAME = u'blip.tv'
1960 def report_direct_download(self, title):
1961 """Report information extraction."""
1962 self.to_screen(u'%s: Direct download detected' % title)
1964 def _real_extract(self, url):
1965 mobj = re.match(self._VALID_URL, url)
1967 raise ExtractorError(u'Invalid URL: %s' % url)
# api.swf#ID URLs are rewritten to /play/ URLs, which then redirect.
1969 # See https://github.com/rg3/youtube-dl/issues/857
1970 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1971 if api_mobj is not None:
1972 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1973 urlp = compat_urllib_parse_urlparse(url)
1974 if urlp.path.startswith('/play/'):
# Follow the /play/ redirect and recover the file id from the URL fragment,
# then recurse with the canonical http://blip.tv/a/a-<id> URL.
1975 request = compat_urllib_request.Request(url)
1976 response = compat_urllib_request.urlopen(request)
1977 redirecturl = response.geturl()
1978 rurlp = compat_urllib_parse_urlparse(redirecturl)
1979 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1980 url = 'http://blip.tv/a/a-' + file_id
1981 return self._real_extract(url)
# Ask blip.tv's JSON API for the metadata (cchar assignment elided above).
1988 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1989 request = compat_urllib_request.Request(json_url)
# The API behaves differently for iTunes; spoof its user agent.
1990 request.add_header('User-Agent', 'iTunes/10.6.1')
1991 self.report_extraction(mobj.group(1))
1994 urlh = compat_urllib_request.urlopen(request)
1995 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1996 basename = url.split('/')[-1]
1997 title,ext = os.path.splitext(basename)
1998 title = title.decode('UTF-8')
1999 ext = ext.replace('.', '')
2000 self.report_direct_download(title)
2005 'upload_date': None,
2010 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2011 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2012 if info is None: # Regular URL
# Parse the JSON API response for the regular (non-direct-download) case.
2014 json_code_bytes = urlh.read()
2015 json_code = json_code_bytes.decode('utf-8')
2016 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2017 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2020 json_data = json.loads(json_code)
2021 if 'Post' in json_data:
2022 data = json_data['Post']
# blip.tv timestamps look like '%m-%d-%y %H:%M%p'; normalize to YYYYMMDD.
2026 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2027 video_url = data['media']['url']
2028 umobj = re.match(self._URL_EXT, video_url)
2030 raise ValueError('Can not determine filename extension')
2031 ext = umobj.group(1)
2034 'id': data['item_id'],
2036 'uploader': data['display_name'],
2037 'upload_date': upload_date,
2038 'title': data['title'],
2040 'format': data['media']['mimeType'],
2041 'thumbnail': data['thumbnailUrl'],
2042 'description': data['description'],
2043 'player_url': data['embedUrl'],
# Carry the spoofed UA forward so the download uses the same identity.
2044 'user_agent': 'iTunes/10.6.1',
2046 except (ValueError,KeyError) as err:
2047 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2052 class MyVideoIE(InfoExtractor):
2053 """Information Extractor for myvideo.de."""
2055 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2056 IE_NAME = u'myvideo'
2058 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2059 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2060 # https://github.com/rg3/youtube-dl/pull/842
# RC4 stream cipher used to decrypt the player XML (KSA below, PRGA elided
# in part by the dump).
2061 def __rc4crypt(self,data, key):
2063 box = list(range(256))
2064 for i in list(range(256)):
2065 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2066 box[i], box[x] = box[x], box[i]
2072 y = (y + box[x]) % 256
2073 box[x], box[y] = box[y], box[x]
2074 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# MD5 helper (its def line is elided); returns the hex digest as bytes.
2078 return hashlib.md5(s).hexdigest().encode()
2080 def _real_extract(self,url):
2081 mobj = re.match(self._VALID_URL, url)
2083 raise ExtractorError(u'invalid URL: %s' % url)
2085 video_id = mobj.group(1)
# Obfuscated key material, double-base64-encoded (assignment header elided).
2088 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2089 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2090 b'TnpsbA0KTVRkbU1tSTRNdz09'
2094 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2095 webpage = self._download_webpage(webpage_url, video_id)
# Easy case first: a plain <source src=...> tag means a direct FLV URL.
2097 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2098 if mobj is not None:
2099 self.report_extraction(video_id)
2100 video_url = mobj.group(1) + '.flv'
2102 video_title = self._html_search_regex('<title>([^<]+)</title>',
2105 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2111 'upload_date': None,
2112 'title': video_title,
# Hard case: parameters live in a flashvars JS object.
2117 mobj = re.search('var flashvars={(.+?)}', webpage)
2119 raise ExtractorError(u'Unable to extract video')
2124 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2125 if not a == '_encxml':
2128 encxml = compat_urllib_parse.unquote(b)
2129 if not params.get('domain'):
2130 params['domain'] = 'www.myvideo.de'
2131 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2132 if 'flash_playertype=MTV' in xmldata_url:
# The MTV player variant is not supported; fall back to player type D.
2133 self._downloader.report_warning(u'avoiding MTV player')
2135 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2136 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# The response is '...=<hex>'; take the hex payload and RC4-decrypt it.
2140 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2141 enc_data_b = binascii.unhexlify(enc_data)
2143 base64.b64decode(base64.b64decode(GK)) +
2145 str(video_id).encode('utf-8')
2148 dec_data = self.__rc4crypt(enc_data_b, sk)
2151 self.report_extraction(video_id)
# RTMP case: connectionurl carries the stream URL.
2154 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2156 video_url = compat_urllib_parse.unquote(mobj.group(1))
2157 if 'myvideo2flash' in video_url:
2158 self._downloader.report_warning(u'forcing RTMPT ...')
2159 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2162 # extract non rtmp videos
2163 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2165 raise ExtractorError(u'unable to extract url')
2166 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2168 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2169 video_file = compat_urllib_parse.unquote(video_file)
2171 if not video_file.endswith('f4m'):
# play_path is '<ext>:<path>' as expected by rtmpdump.
2172 ppath, prefix = video_file.split('.')
2173 video_playpath = '%s:%s' % (prefix, ppath)
2174 video_hls_playlist = ''
# .f4m manifests map to an .m3u8 HLS playlist on the same path.
2177 video_hls_playlist = (
2178 video_filepath + video_file
2179 ).replace('.f4m', '.m3u8')
2181 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2182 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2184 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2190 'tc_url': video_url,
2192 'upload_date': None,
2193 'title': video_title,
2195 'play_path': video_playpath,
2196 'video_file': video_file,
2197 'video_hls_playlist': video_hls_playlist,
2198 'player_url': video_swfobj,
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2202 class ComedyCentralIE(InfoExtractor):
2203 """Information extractor for The Daily Show and Colbert Report """
2205 # urls can be abbreviations like :thedailyshow or :colbert
2206 # urls for episodes like:
2207 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2208 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2209 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose-mode regex: matched with re.VERBOSE (see suitable()/_real_extract).
2210 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2211 |(https?://)?(www\.)?
2212 (?P<showname>thedailyshow|colbertnation)\.com/
2213 (full-episodes/(?P<episode>.*)|
2215 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2216 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates in ascending order; the last entry is picked as "best" below.
2219 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2221 _video_extensions = {
2229 _video_dimensions = {
# Overrides the base class because _VALID_URL needs the re.VERBOSE flag.
2239 def suitable(cls, url):
2240 """Receives a URL and returns True if suitable for this IE."""
2241 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2243 def _print_formats(self, formats):
2244 print('Available formats:')
2246 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2249 def _real_extract(self, url):
2250 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2252 raise ExtractorError(u'Invalid URL: %s' % url)
# Shortname forms (:tds, :colbert, ...) expand to the full-episodes page.
2254 if mobj.group('shortname'):
2255 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2256 url = u'http://www.thedailyshow.com/full-episodes/'
2258 url = u'http://www.colbertnation.com/full-episodes/'
2259 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2260 assert mobj is not None
2262 if mobj.group('clip'):
2263 if mobj.group('showname') == 'thedailyshow':
2264 epTitle = mobj.group('tdstitle')
2266 epTitle = mobj.group('cntitle')
# No episode in the URL means "download the newest"; handled via redirect.
2269 dlNewest = not mobj.group('episode')
2271 epTitle = mobj.group('showname')
2273 epTitle = mobj.group('episode')
2275 self.report_extraction(epTitle)
2276 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# Re-match against the redirected URL to learn the concrete episode.
2278 url = htmlHandle.geturl()
2279 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2281 raise ExtractorError(u'Invalid redirected URL: ' + url)
2282 if mobj.group('episode') == '':
2283 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2284 epTitle = mobj.group('episode')
2286 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2288 if len(mMovieParams) == 0:
2289 # The Colbert Report embeds the information in a without
2290 # a URL prefix; so extract the alternate reference
2291 # and then add the URL prefix manually.
2293 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2294 if len(altMovieParams) == 0:
2295 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2297 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2299 uri = mMovieParams[0][1]
# One RSS index per show; each <item> is one part of the episode.
2300 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2301 indexXml = self._download_webpage(indexUrl, epTitle,
2302 u'Downloading show index',
2303 u'unable to download episode index')
2307 idoc = xml.etree.ElementTree.fromstring(indexXml)
2308 itemEls = idoc.findall('.//item')
2309 for partNum,itemEl in enumerate(itemEls):
2310 mediaId = itemEl.findall('./guid')[0].text
2311 shortMediaId = mediaId.split(':')[-1]
2312 showId = mediaId.split(':')[-2].replace('.com', '')
2313 officialTitle = itemEl.findall('./title')[0].text
2314 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2316 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2317 compat_urllib_parse.urlencode({'uri': mediaId}))
2318 configXml = self._download_webpage(configUrl, epTitle,
2319 u'Downloading configuration for %s' % shortMediaId)
2321 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, rtmp-url) pairs; turls accumulation line elided.
2323 for rendition in cdoc.findall('.//rendition'):
2324 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2328 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2331 if self._downloader.params.get('listformats', None):
2332 self._print_formats([i[0] for i in turls])
2335 # For now, just pick the highest bitrate
2336 format,rtmp_video_url = turls[-1]
2338 # Get the format arg from the arg stream
2339 req_format = self._downloader.params.get('format', None)
2341 # Select format if we can find one
2344 format, rtmp_video_url = f, v
# Translate the RTMP URL into a plain HTTP mirror on llnwd.net.
2347 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2349 raise ExtractorError(u'Cannot transform RTMP url')
2350 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2351 video_url = base + m.group('finalid')
2353 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2358 'upload_date': officialDate,
2363 'description': officialTitle,
2365 results.append(info)
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2370 class EscapistIE(InfoExtractor):
2371 """Information extractor for The Escapist """
2373 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2374 IE_NAME = u'escapist'
2376 def _real_extract(self, url):
2377 mobj = re.match(self._VALID_URL, url)
2379 raise ExtractorError(u'Invalid URL: %s' % url)
2380 showName = mobj.group('showname')
2381 videoId = mobj.group('episode')
2383 self.report_extraction(videoId)
2384 webpage = self._download_webpage(url, videoId)
# Metadata comes from <meta> tags; description/thumbnail are optional.
2386 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2387 webpage, u'description', fatal=False)
2389 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2390 webpage, u'thumbnail', fatal=False)
2392 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2393 webpage, u'player url')
# NOTE(review): the error label here says u'player url' but this extracts
# the title — looks like a copy-paste slip in the original (message only).
2395 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2396 webpage, u'player url').split(' : ')[-1]
# The player URL carries a config=... query with the playlist location.
2398 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2399 configUrl = compat_urllib_parse.unquote(configUrl)
2401 configJSON = self._download_webpage(configUrl, videoId,
2402 u'Downloading configuration',
2403 u'unable to download configuration')
2405 # Technically, it's JavaScript, not JSON
# Single-quote to double-quote rewrite so json.loads accepts it; fragile if
# the payload ever contains embedded quotes.
2406 configJSON = configJSON.replace("'", '"')
2409 config = json.loads(configJSON)
2410 except (ValueError,) as err:
2411 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2413 playlist = config['playlist']
# Index 1 is assumed to be the actual video entry of the playlist.
2414 videoUrl = playlist[1]['url']
2419 'uploader': showName,
2420 'upload_date': None,
2423 'thumbnail': imgUrl,
2424 'description': videoDesc,
2425 'player_url': playerUrl,
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2430 class CollegeHumorIE(InfoExtractor):
2431 """Information extractor for collegehumor.com"""
2434 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2435 IE_NAME = u'collegehumor'
2437 def report_manifest(self, video_id):
2438 """Report information extraction."""
2439 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2441 def _real_extract(self, url):
2442 mobj = re.match(self._VALID_URL, url)
2444 raise ExtractorError(u'Invalid URL: %s' % url)
2445 video_id = mobj.group('videoid')
# Partial info dict initialisation (opening lines elided in this dump).
2450 'upload_date': None,
2453 self.report_extraction(video_id)
# First request: metadata XML from the moogaloop endpoint.
2454 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2456 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2457 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2458 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2460 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2462 videoNode = mdoc.findall('./video')[0]
2463 info['description'] = videoNode.findall('./description')[0].text
2464 info['title'] = videoNode.findall('./caption')[0].text
2465 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2466 manifest_url = videoNode.findall('./file')[0].text
2468 raise ExtractorError(u'Invalid metadata XML file')
# Second request: the Adobe HDS (f4m) manifest referenced by the metadata.
2470 manifest_url += '?hdcore=2.10.3'
2471 self.report_manifest(video_id)
2473 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2475 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2477 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# Elements live in the Adobe f4m XML namespace.
2479 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2480 node_id = media_node.attrib['url']
2481 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2482 except IndexError as err:
2483 raise ExtractorError(u'Invalid manifest file')
# Build the direct fragment URL from the manifest's location and ids.
2485 url_pr = compat_urllib_parse_urlparse(manifest_url)
2486 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2493 class XVideosIE(InfoExtractor):
2494 """Information extractor for xvideos.com"""
2496 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2497 IE_NAME = u'xvideos'
2499 def _real_extract(self, url):
2500 mobj = re.match(self._VALID_URL, url)
2502 raise ExtractorError(u'Invalid URL: %s' % url)
2503 video_id = mobj.group(1)
2505 webpage = self._download_webpage(url, video_id)
2507 self.report_extraction(video_id)
# The page embeds a URL-encoded flv_url flashvar; unquote it for the result.
2510 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2511 webpage, u'video URL'))
# Title is taken from <title>, dropping the trailing " - XVID..." suffix.
2514 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2517 # Extract video thumbnail
2518 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2519 webpage, u'thumbnail', fatal=False)
# Tail of the returned info dict (opening lines elided in this dump).
2525 'upload_date': None,
2526 'title': video_title,
2528 'thumbnail': video_thumbnail,
2529 'description': None,
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2535 class SoundcloudIE(InfoExtractor):
2536 """Information extractor for soundcloud.com
2537 To access the media, the uid of the song and a stream token
2538 must be extracted from the page source and the script must make
2539 a request to media.soundcloud.com/crossdomain.xml. Then
2540 the media can be grabbed by requesting from an url composed
2541 of the stream token and uid
2544 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2545 IE_NAME = u'soundcloud'
2547 def report_resolve(self, video_id):
2548 """Report information extraction."""
2549 self.to_screen(u'%s: Resolving id' % video_id)
2551 def _real_extract(self, url):
2552 mobj = re.match(self._VALID_URL, url)
2554 raise ExtractorError(u'Invalid URL: %s' % url)
2556 # extract uploader (which is in the url)
2557 uploader = mobj.group(1)
2558 # extract simple title (uploader + slug of song title)
2559 slug_title = mobj.group(2)
2560 simple_title = uploader + u'-' + slug_title
2561 full_title = '%s/%s' % (uploader, slug_title)
2563 self.report_resolve(full_title)
# Step 1: resolve the page URL to track metadata via the public API.
# The client_id is a hard-coded API key baked into this extractor.
2565 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2566 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2567 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2569 info = json.loads(info_json)
2570 video_id = info['id']
2571 self.report_extraction(full_title)
# Step 2: fetch the stream definitions for the resolved track id.
2573 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2574 stream_json = self._download_webpage(streams_url, full_title,
2575 u'Downloading stream definitions',
2576 u'unable to download stream definitions')
2578 streams = json.loads(stream_json)
# Always picks the 128 kbit/s MP3 HTTP stream.
2579 mediaURL = streams['http_mp3_128_url']
2580 upload_date = unified_strdate(info['created_at'])
# Tail of the returned info dict (opening lines elided in this dump).
2585 'uploader': info['user']['username'],
2586 'upload_date': upload_date,
2587 'title': info['title'],
2589 'description': info['description'],
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2592 class SoundcloudSetIE(InfoExtractor):
2593 """Information extractor for soundcloud.com sets
2594 To access the media, the uid of the song and a stream token
2595 must be extracted from the page source and the script must make
2596 a request to media.soundcloud.com/crossdomain.xml. Then
2597 the media can be grabbed by requesting from an url composed
2598 of the stream token and uid
2601 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2602 IE_NAME = u'soundcloud:set'
2604 def report_resolve(self, video_id):
2605 """Report information extraction."""
2606 self.to_screen(u'%s: Resolving id' % video_id)
2608 def _real_extract(self, url):
2609 mobj = re.match(self._VALID_URL, url)
2611 raise ExtractorError(u'Invalid URL: %s' % url)
2613 # extract uploader (which is in the url)
2614 uploader = mobj.group(1)
2615 # extract simple title (uploader + slug of song title)
2616 slug_title = mobj.group(2)
2617 simple_title = uploader + u'-' + slug_title
2618 full_title = '%s/sets/%s' % (uploader, slug_title)
2620 self.report_resolve(full_title)
# Resolve the set URL to its metadata (same hard-coded client_id as
# SoundcloudIE).
2622 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2623 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2624 info_json = self._download_webpage(resolv_url, full_title)
2627 info = json.loads(info_json)
# Surface API-reported errors before attempting track extraction.
2628 if 'errors' in info:
2629 for err in info['errors']:
2630 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2633 self.report_extraction(full_title)
# One stream-definition request per track in the set.
2634 for track in info['tracks']:
2635 video_id = track['id']
2637 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2638 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2640 self.report_extraction(video_id)
2641 streams = json.loads(stream_json)
2642 mediaURL = streams['http_mp3_128_url']
# Per-track info dict (opening lines elided in this dump).
2647 'uploader': track['user']['username'],
2648 'upload_date': unified_strdate(track['created_at']),
2649 'title': track['title'],
2651 'description': track['description'],
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2656 class InfoQIE(InfoExtractor):
2657 """Information extractor for infoq.com"""
2658 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2660 def _real_extract(self, url):
2661 mobj = re.match(self._VALID_URL, url)
2663 raise ExtractorError(u'Invalid URL: %s' % url)
# No separate id in the URL pattern, so the URL itself serves as video_id.
2665 webpage = self._download_webpage(url, video_id=url)
2666 self.report_extraction(url)
# The real media id is base64-encoded in the page's jsclassref variable.
2669 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2671 raise ExtractorError(u'Unable to extract video url')
2672 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2673 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2676 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2679 # Extract description
2680 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2681 webpage, u'description', fatal=False)
# Derive id and extension from the media file name.
2683 video_filename = video_url.split('/')[-1]
2684 video_id, extension = video_filename.split('.')
2690 'upload_date': None,
2691 'title': video_title,
2692 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2694 'description': video_description,
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2699 class MixcloudIE(InfoExtractor):
2700 """Information extractor for www.mixcloud.com"""
# Marked broken; tests are skipped for this extractor (see _WORKING docs).
2702 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2703 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2704 IE_NAME = u'mixcloud'
2706 def report_download_json(self, file_id):
2707 """Report JSON download."""
2708 self.to_screen(u'Downloading json')
2710 def get_urls(self, jsonData, fmt, bitrate='best'):
2711 """Get urls from 'audio_formats' section in json"""
2714 bitrate_list = jsonData[fmt]
2715 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2716 bitrate = max(bitrate_list) # select highest
2718 url_list = jsonData[fmt][bitrate]
# TypeError means jsonData[fmt] is a flat list with no per-bitrate keys.
2719 except TypeError: # we have no bitrate info.
2720 url_list = jsonData[fmt]
2723 def check_urls(self, url_list):
2724 """Returns 1st active url from list"""
2725 for url in url_list:
2727 compat_urllib_request.urlopen(url)
2729 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2734 def _print_formats(self, formats):
2735 print('Available formats:')
2736 for fmt in formats.keys():
2737 for b in formats[fmt]:
2739 ext = formats[fmt][b][0]
2740 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2741 except TypeError: # we have no bitrate info
2742 ext = formats[fmt][0]
2743 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2746 def _real_extract(self, url):
2747 mobj = re.match(self._VALID_URL, url)
2749 raise ExtractorError(u'Invalid URL: %s' % url)
2750 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on a regex group is Python-2-only; under
# Python 3 mobj.group(1) is already str and has no .decode.
2751 uploader = mobj.group(1).decode('utf-8')
2752 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2754 # construct API request
2755 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2756 # retrieve .json file with links to files
2757 request = compat_urllib_request.Request(file_url)
2759 self.report_download_json(file_url)
2760 jsonData = compat_urllib_request.urlopen(request).read()
2761 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2762 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2765 json_data = json.loads(jsonData)
2766 player_url = json_data['player_swf_url']
2767 formats = dict(json_data['audio_formats'])
2769 req_format = self._downloader.params.get('format', None)
2772 if self._downloader.params.get('listformats', None):
2773 self._print_formats(formats)
# 'best' (or unspecified): probe each format until a live URL is found.
2776 if req_format is None or req_format == 'best':
2777 for format_param in formats.keys():
2778 url_list = self.get_urls(formats, format_param)
2780 file_url = self.check_urls(url_list)
2781 if file_url is not None:
2784 if req_format not in formats:
2785 raise ExtractorError(u'Format is not available')
2787 url_list = self.get_urls(formats, req_format)
2788 file_url = self.check_urls(url_list)
2789 format_param = req_format
# Return dict (Python-2-only .decode calls, consistent with above).
2792 'id': file_id.decode('utf-8'),
2793 'url': file_url.decode('utf-8'),
2794 'uploader': uploader.decode('utf-8'),
2795 'upload_date': None,
2796 'title': json_data['name'],
2797 'ext': file_url.split('.')[-1].decode('utf-8'),
2798 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2799 'thumbnail': json_data['thumbnail_url'],
2800 'description': json_data['description'],
2801 'player_url': player_url.decode('utf-8'),
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2804 class StanfordOpenClassroomIE(InfoExtractor):
2805 """Information extractor for Stanford's Open ClassRoom"""
2807 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2808 IE_NAME = u'stanfordoc'
2810 def _real_extract(self, url):
2811 mobj = re.match(self._VALID_URL, url)
2813 raise ExtractorError(u'Invalid URL: %s' % url)
# Three URL shapes: a single video, a course page, and the site root.
2815 if mobj.group('course') and mobj.group('video'): # A specific video
2816 course = mobj.group('course')
2817 video = mobj.group('video')
2819 'id': course + '_' + video,
2821 'upload_date': None,
2824 self.report_extraction(info['id'])
2825 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2826 xmlUrl = baseUrl + video + '.xml'
2828 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2829 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2830 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2831 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2833 info['title'] = mdoc.findall('./title')[0].text
2834 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2836 raise ExtractorError(u'Invalid metadata XML file')
2837 info['ext'] = info['url'].rpartition('.')[2]
2839 elif mobj.group('course'): # A course page
2840 course = mobj.group('course')
2845 'upload_date': None,
2848 coursepage = self._download_webpage(url, info['id'],
2849 note='Downloading course info page',
2850 errnote='Unable to download course info page')
2852 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2854 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2855 coursepage, u'description', fatal=False)
# Each linked VideoPage becomes a reference entry resolved recursively.
2857 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2860 'type': 'reference',
2861 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2865 for entry in info['list']:
2866 assert entry['type'] == 'reference'
2867 results += self.extract(entry['url'])
# Root page: enumerate all courses and recurse into each CoursePage.
2871 'id': 'Stanford OpenClassroom',
2874 'upload_date': None,
2877 self.report_download_webpage(info['id'])
2878 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2880 rootpage = compat_urllib_request.urlopen(rootURL).read()
2881 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2882 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2884 info['title'] = info['id']
2886 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2889 'type': 'reference',
2890 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2895 for entry in info['list']:
2896 assert entry['type'] == 'reference'
2897 results += self.extract(entry['url'])
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2900 class MTVIE(InfoExtractor):
2901 """Information extractor for MTV.com"""
2903 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2906 def _real_extract(self, url):
2907 mobj = re.match(self._VALID_URL, url)
2909 raise ExtractorError(u'Invalid URL: %s' % url)
# Scheme is optional in _VALID_URL; default to http for the request.
2910 if not mobj.group('proto'):
2911 url = 'http://' + url
2912 video_id = mobj.group('videoid')
2914 webpage = self._download_webpage(url, video_id)
# Song/artist/uri metadata come from MTV-specific <meta> tags.
2916 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2917 webpage, u'song name', fatal=False)
2919 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2922 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2923 webpage, u'mtvn_uri', fatal=False)
2925 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2926 webpage, u'content id', fatal=False)
# NOTE(review): mtvn_uri/content_id are extracted with fatal=False and can
# be None, which would break this string concatenation — worth confirming
# against the complete original before relying on it.
2928 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2929 self.report_extraction(video_id)
2930 request = compat_urllib_request.Request(videogen_url)
2932 metadataXml = compat_urllib_request.urlopen(request).read()
2933 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2934 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2936 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2937 renditions = mdoc.findall('.//rendition')
2939 # For now, always pick the highest quality.
2940 rendition = renditions[-1]
# Format string: "<ext>-<width>x<height>_<bitrate>" from rendition attrs.
2943 _,_,ext = rendition.attrib['type'].partition('/')
2944 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2945 video_url = rendition.find('./src').text
2947 raise ExtractorError('Invalid rendition field.')
# Tail of the returned info dict (opening lines elided in this dump).
2952 'uploader': performer,
2953 'upload_date': None,
2954 'title': video_title,
# NOTE(review): the embedded original line numbers are non-contiguous, so some
# statements of this class are missing from this dump; comments only below.
2962 class YoukuIE(InfoExtractor):
2963 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id generator: millisecond timestamp plus two random components
# (the enclosing def line is elided in this dump).
2966 nowTime = int(time.time() * 1000)
2967 random1 = random.randint(1000,1998)
2968 random2 = random.randint(1000,9999)
2970 return "%d%d%d" %(nowTime,random1,random2)
2972 def _get_file_ID_mix_string(self, seed):
# Deterministic seeded shuffle of the alphabet used to decode file ids.
2974 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2976 for i in range(len(source)):
2977 seed = (seed * 211 + 30031 ) % 65536
2978 index = math.floor(seed / 65536 * len(source) )
2979 mixed.append(source[int(index)])
2980 source.remove(source[int(index)])
2981 #return ''.join(mixed)
2984 def _get_file_id(self, fileId, seed):
# Decode the '*'-separated index list into the real file id via the
# seed-mixed alphabet.
2985 mixed = self._get_file_ID_mix_string(seed)
2986 ids = fileId.split('*')
2990 realId.append(mixed[int(ch)])
2991 return ''.join(realId)
2993 def _real_extract(self, url):
2994 mobj = re.match(self._VALID_URL, url)
2996 raise ExtractorError(u'Invalid URL: %s' % url)
2997 video_id = mobj.group('ID')
2999 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3001 jsondata = self._download_webpage(info_url, video_id)
3003 self.report_extraction(video_id)
3005 config = json.loads(jsondata)
3007 video_title = config['data'][0]['title']
3008 seed = config['data'][0]['seed']
# Format selection among the stream ids advertised by the playlist.
3010 format = self._downloader.params.get('format', None)
3011 supported_format = list(config['data'][0]['streamfileids'].keys())
3013 if format is None or format == 'best':
3014 if 'hd2' in supported_format:
3019 elif format == 'worst':
3027 fileid = config['data'][0]['streamfileids'][format]
3028 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3029 except (UnicodeDecodeError, ValueError, KeyError):
3030 raise ExtractorError(u'Unable to extract info section')
3033 sid = self._gen_sid()
3034 fileid = self._get_file_id(fileid, seed)
3036 #column 8,9 of fileid represent the segment number
3037 #fileid[7:9] should be changed
# One download URL per segment key; segment index is hex-encoded into the
# file id at positions 8-9.
3038 for index, key in enumerate(keys):
3040 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3041 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3044 'id': '%s_part%02d' % (video_id, index),
3045 'url': download_url,
3047 'upload_date': None,
3048 'title': video_title,
3051 files_info.append(info)
3056 class XNXXIE(InfoExtractor):
3057 """Information extractor for xnxx.com"""
3059 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3061 VIDEO_URL_RE = r'flv_url=(.*?)&'
3062 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3063 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3065 def _real_extract(self, url):
3066 mobj = re.match(self._VALID_URL, url)
3068 raise ExtractorError(u'Invalid URL: %s' % url)
3069 video_id = mobj.group(1)
3071 # Get webpage content
3072 webpage = self._download_webpage(url, video_id)
3074 video_url = self._search_regex(self.VIDEO_URL_RE,
3075 webpage, u'video URL')
3076 video_url = compat_urllib_parse.unquote(video_url)
3078 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
3081 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3082 webpage, u'thumbnail', fatal=False)
3088 'upload_date': None,
3089 'title': video_title,
3091 'thumbnail': video_thumbnail,
3092 'description': None,
# GooglePlusIE: information extractor for plus.google.com posts.
# NOTE(review): numbered listing with gaps — guards ('if mobj is None:'),
# the 'try:' before line 3156, and the result-dict opener are missing here.
3096 class GooglePlusIE(InfoExtractor):
3097 """Information extractor for plus.google.com."""
3099 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3100 IE_NAME = u'plus.google'
# _real_extract: fetches the post page for metadata, then the photo/video
# page, and picks the highest-resolution redirector.googlevideo.com link.
3102 def _real_extract(self, url):
3103 # Extract id from URL
3104 mobj = re.match(self._VALID_URL, url)
3106 raise ExtractorError(u'Invalid URL: %s' % url)
3108 post_url = mobj.group(0)
3109 video_id = mobj.group(1)
3111 video_extension = 'flv'
3113 # Step 1, Retrieve post webpage to extract further information
3114 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3116 self.report_extraction(video_id)
3118 # Extract update date
3119 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3120 webpage, u'upload date', fatal=False)
3122 # Convert timestring to a format suitable for filename
# NOTE(review): assumes the scraped timestamp is exactly '%Y-%m-%d';
# strptime raises ValueError otherwise — confirm against live pages.
3123 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3124 upload_date = upload_date.strftime('%Y%m%d')
3127 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3128 webpage, u'uploader', fatal=False)
3131 # Get the first line for title
3132 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3133 webpage, 'title', default=u'NA')
3135 # Step 2, Stimulate clicking the image box to launch video
3136 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3137 webpage, u'video page URL')
3138 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3140 # Extract video links on video page
3141 """Extract video links of all sizes"""
3142 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3143 mobj = re.findall(pattern, webpage)
3145 raise ExtractorError(u'Unable to extract video links')
3147 # Sort in resolution
3148 links = sorted(mobj)
3150 # Choose the lowest of the sort, i.e. highest resolution
3151 video_url = links[-1]
3152 # Only get the url. The resolution part in the tuple has no use anymore
3153 video_url = video_url[-1]
3154 # Treat escaped \u0026 style hex
# Python 2 str has .decode; on Python 3 the AttributeError path re-encodes
# to bytes and decodes with 'unicode-escape' instead.
3156 video_url = video_url.decode("unicode_escape")
3157 except AttributeError: # Python 3
3158 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3164 'uploader': uploader,
3165 'upload_date': upload_date,
3166 'title': video_title,
3167 'ext': video_extension,
# NBAIE: information extractor for nba.com video pages.
# NOTE(review): numbered listing with gaps — the 'if mobj is None:' guard
# and the returned info-dict opener/closer are missing from this view.
3170 class NBAIE(InfoExtractor):
3171 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
# _real_extract: the media URL is constructed directly from the URL path
# against Turner's CDN; the page itself is only scraped for title/description.
3174 def _real_extract(self, url):
3175 mobj = re.match(self._VALID_URL, url)
3177 raise ExtractorError(u'Invalid URL: %s' % url)
3179 video_id = mobj.group(1)
3181 webpage = self._download_webpage(url, video_id)
3183 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Use only the last path segment as the id shown to the user.
3185 shortened_video_id = video_id.rpartition('/')[2]
3186 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3187 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3189 # It isn't there in the HTML it returns to us
3190 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3192 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3195 'id': shortened_video_id,
3199 # 'uploader_date': uploader_date,
3200 'description': description,
# JustinTVIE: information extractor for justin.tv / twitch.tv archives,
# broadcasts (/b/) and chapters (/c/).
# NOTE(review): numbered listing with many missing interior lines (guards,
# dict openers, loop setup such as 'offset = 0' / 'paged = ...').
3204 class JustinTVIE(InfoExtractor):
3205 """Information extractor for justin.tv and twitch.tv"""
3206 # TODO: One broadcast may be split into multiple videos. The key
3207 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3208 # starts at 1 and increases. Can we treat all parts as one video?
3210 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3212 (?P<channelid>[^/]+)|
3213 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3214 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# Page size used when walking a channel archive via the paged API.
3218 _JUSTIN_PAGE_LIMIT = 100
3219 IE_NAME = u'justin.tv'
3221 def report_download_page(self, channel, offset):
3222 """Report attempt to download a single page of videos."""
3223 self.to_screen(u'%s: Downloading video information from %d to %d' %
3224 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3226 # Return count of items, list of *valid* items
3227 def _parse_page(self, url, video_id):
3228 webpage = self._download_webpage(url, video_id,
3229 u'Downloading video info JSON',
3230 u'unable to download video info JSON')
# A non-list JSON response is an API error object with an 'error' field.
3232 response = json.loads(webpage)
3233 if type(response) != list:
3234 error_text = response.get('error', 'unknown error')
3235 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3237 for clip in response:
3238 video_url = clip['video_file_url']
3240 video_extension = os.path.splitext(video_url)[1][1:]
# Normalize 'YYYY-MM-DD...' start_time into 'YYYYMMDD'.
3241 video_date = re.sub('-', '', clip['start_time'][:10])
3242 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3243 video_id = clip['id']
3244 video_title = clip.get('title', video_id)
3248 'title': video_title,
3249 'uploader': clip.get('channel_name', video_uploader_id),
3250 'uploader_id': video_uploader_id,
3251 'upload_date': video_date,
3252 'ext': video_extension,
3254 return (len(response), info)
# _real_extract: dispatches on which named group matched — whole channel,
# single chapter, or single broadcast — then pages through the JSON API.
3256 def _real_extract(self, url):
3257 mobj = re.match(self._VALID_URL, url)
3259 raise ExtractorError(u'invalid URL: %s' % url)
3261 api_base = 'http://api.justin.tv'
3263 if mobj.group('channelid'):
3265 video_id = mobj.group('channelid')
3266 api = api_base + '/channel/archives/%s.json' % video_id
3267 elif mobj.group('chapterid'):
3268 chapter_id = mobj.group('chapterid')
3270 webpage = self._download_webpage(url, chapter_id)
3271 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3273 raise ExtractorError(u'Cannot find archive of a chapter')
3274 archive_id = m.group(1)
3276 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3277 chapter_info_xml = self._download_webpage(api, chapter_id,
3278 note=u'Downloading chapter information',
3279 errnote=u'Chapter information download failed')
# Locate the <archive> element whose <id> matches the page's archive_id.
3280 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3281 for a in doc.findall('.//archive'):
3282 if archive_id == a.find('./id').text:
3285 raise ExtractorError(u'Could not find chapter in chapter information')
3287 video_url = a.find('./video_file_url').text
3288 video_ext = video_url.rpartition('.')[2] or u'flv'
3290 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3291 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3292 note='Downloading chapter metadata',
3293 errnote='Download of chapter metadata failed')
3294 chapter_info = json.loads(chapter_info_json)
3296 bracket_start = int(doc.find('.//bracket_start').text)
3297 bracket_end = int(doc.find('.//bracket_end').text)
3299 # TODO determine start (and probably fix up file)
3300 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3301 #video_url += u'?start=' + TODO:start_timestamp
3302 # bracket_start is 13290, but we want 51670615
3303 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3304 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3307 'id': u'c' + chapter_id,
3310 'title': chapter_info['title'],
3311 'thumbnail': chapter_info['preview'],
3312 'description': chapter_info['description'],
3313 'uploader': chapter_info['channel']['display_name'],
3314 'uploader_id': chapter_info['channel']['name'],
# Broadcast (/b/) case: single archive lookup by id.
3318 video_id = mobj.group('videoid')
3319 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3321 self.report_extraction(video_id)
# Page through results until a short page (fewer than 'limit' items).
3325 limit = self._JUSTIN_PAGE_LIMIT
3328 self.report_download_page(video_id, offset)
3329 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3330 page_count, page_info = self._parse_page(page_url, video_id)
3331 info.extend(page_info)
3332 if not paged or page_count != limit:
# FunnyOrDieIE: information extractor for funnyordie.com.
# NOTE(review): numbered listing with gaps (guard and info-dict lines missing).
3337 class FunnyOrDieIE(InfoExtractor):
3338 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
# _real_extract: scrapes the second <source> inside the <video> tag for the
# media URL; title falls back from the page h1 to the <title> element.
3340 def _real_extract(self, url):
3341 mobj = re.match(self._VALID_URL, url)
3343 raise ExtractorError(u'invalid URL: %s' % url)
3345 video_id = mobj.group('id')
3346 webpage = self._download_webpage(url, video_id)
3348 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3349 webpage, u'video URL', flags=re.DOTALL)
# Tuple of patterns: tried in order until one matches.
3351 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3352 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3354 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3355 webpage, u'description', fatal=False, flags=re.DOTALL)
3362 'description': video_description,
# SteamIE: information extractor for store.steampowered.com video pages.
# NOTE(review): numbered listing with gaps — the @classmethod decorator for
# suitable() and parts of the per-video loop are missing from this view.
3366 class SteamIE(InfoExtractor):
# _VALID_URL is written with (?x)-style whitespace/comments, hence the
# explicit re.VERBOSE in suitable()/_real_extract below.
3367 _VALID_URL = r"""http://store\.steampowered\.com/
3369 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3371 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3373 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
3374 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
3377 def suitable(cls, url):
3378 """Receives a URL and returns True if suitable for this IE."""
3379 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# _real_extract: downloads the game's video page (retrying through the
# age gate if needed) and zips together movie URLs, titles and thumbs.
3381 def _real_extract(self, url):
3382 m = re.match(self._VALID_URL, url, re.VERBOSE)
3383 gameID = m.group('gameID')
3385 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3386 webpage = self._download_webpage(videourl, gameID)
3388 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3389 videourl = self._AGECHECK_TEMPLATE % gameID
3390 self.report_age_confirmation()
3391 webpage = self._download_webpage(videourl, gameID)
3393 self.report_extraction(gameID)
3394 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3395 webpage, 'game title')
3397 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3398 mweb = re.finditer(urlRE, webpage)
3399 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3400 titles = re.finditer(namesRE, webpage)
3401 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3402 thumbs = re.finditer(thumbsRE, webpage)
# Matches from the three regexes are assumed to be in the same order.
3404 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3405 video_id = vid.group('videoID')
3406 title = vtitle.group('videoName')
3407 video_url = vid.group('videoURL')
3408 video_thumb = thumb.group('thumbnail')
3410 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3415 'title': unescapeHTML(title),
3416 'thumbnail': video_thumb
3419 return [self.playlist_result(videos, gameID, game_title)]
# UstreamIE: information extractor for ustream.tv recorded videos.
# NOTE(review): numbered listing with gaps (info-dict opener/closer missing).
3421 class UstreamIE(InfoExtractor):
3422 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3423 IE_NAME = u'ustream'
# _real_extract: media URL is derived directly from the numeric video id;
# the page is only scraped for title/uploader/thumbnail metadata.
3425 def _real_extract(self, url):
3426 m = re.match(self._VALID_URL, url)
3427 video_id = m.group('videoID')
3429 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3430 webpage = self._download_webpage(url, video_id)
3432 self.report_extraction(video_id)
3434 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3437 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3438 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3440 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3441 webpage, u'thumbnail', fatal=False)
3447 'title': video_title,
3448 'uploader': uploader,
3449 'thumbnail': thumbnail,
# WorldStarHipHopIE: information extractor for worldstarhiphop.com
# (and the 'worldstarcandy' mirror).
# NOTE(review): numbered listing with gaps (ext selection after the 'mp4'
# check and the returned dict opener are missing from this view).
3453 class WorldStarHipHopIE(InfoExtractor):
3454 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3455 IE_NAME = u'WorldStarHipHop'
3457 def _real_extract(self, url):
3458 m = re.match(self._VALID_URL, url)
3459 video_id = m.group('id')
3461 webpage_src = self._download_webpage(url, video_id)
# Media URL comes from the flash player's addVariable("file", ...) call.
3463 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3464 webpage_src, u'video URL')
3466 if 'mp4' in video_url:
3471 video_title = self._html_search_regex(r"<title>(.*)</title>",
3472 webpage_src, u'title')
3474 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3475 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3476 webpage_src, u'thumbnail', fatal=False)
# Candy pages carry the real title in a 'candytitles' span instead.
3479 _title = r"""candytitles.*>(.*)</span>"""
3480 mobj = re.search(_title, webpage_src)
3481 if mobj is not None:
3482 video_title = mobj.group(1)
3487 'title' : video_title,
3488 'thumbnail' : thumbnail,
# RBMARadioIE: information extractor for rbmaradio.com shows.
# NOTE(review): numbered listing with gaps ('try:' before line 3506 and the
# returned dict opener are missing from this view).
3493 class RBMARadioIE(InfoExtractor):
3494 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
# _real_extract: pulls the embedded 'gon.show=' JSON blob out of the page
# and reads all metadata from it; stream URL is the akamai_url at 256 kbps.
3496 def _real_extract(self, url):
3497 m = re.match(self._VALID_URL, url)
3498 video_id = m.group('videoID')
3500 webpage = self._download_webpage(url, video_id)
3502 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
3503 webpage, u'json data', flags=re.MULTILINE)
3506 data = json.loads(json_data)
3507 except ValueError as e:
3508 raise ExtractorError(u'Invalid JSON: ' + str(e))
3510 video_url = data['akamai_url'] + '&cbr=256'
# File extension is taken from the URL path, not the query string.
3511 url_parts = compat_urllib_parse_urlparse(video_url)
3512 video_ext = url_parts.path.rpartition('.')[2]
3517 'title': data['title'],
3518 'description': data.get('teaser_text'),
3519 'location': data.get('country_of_origin'),
3520 'uploader': data.get('host', {}).get('name'),
3521 'uploader_id': data.get('host', {}).get('slug'),
3522 'thumbnail': data.get('image', {}).get('large_url_2x'),
3523 'duration': data.get('duration'),
# YouPornIE: information extractor for youporn.com; supports multiple
# formats, --list-formats, and format selection (best/worst/all/specific).
# NOTE(review): numbered listing with gaps — 'try:' blocks, the per-link
# loop header, and several return statements are missing from this view.
3529 class YouPornIE(InfoExtractor):
3530 """Information extractor for youporn.com."""
3531 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3532 def _print_formats(self, formats):
3533 """Print all available formats"""
3534 print(u'Available formats:')
3535 print(u'ext\t\tformat')
3536 print(u'---------------------------------')
3537 for format in formats:
3538 print(u'%s\t\t%s' % (format['ext'], format['format']))
# _specific: selects the entry whose 'format' equals req_format
# (loop header and return are among the missing lines).
3540 def _specific(self, req_format, formats):
3542 if(x["format"]==req_format):
3546 def _real_extract(self, url):
3547 mobj = re.match(self._VALID_URL, url)
3549 raise ExtractorError(u'Invalid URL: %s' % url)
3550 video_id = mobj.group('videoid')
# An age_verified cookie is required to see the download list.
3552 req = compat_urllib_request.Request(url)
3553 req.add_header('Cookie', 'age_verified=1')
3554 webpage = self._download_webpage(req, video_id)
3556 # Get JSON parameters
3557 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3559 params = json.loads(json_params)
3561 raise ExtractorError(u'Invalid JSON')
3563 self.report_extraction(video_id)
3565 video_title = params['title']
3566 upload_date = unified_strdate(params['release_date_f'])
3567 video_description = params['description']
3568 video_uploader = params['submitted_by']
3569 thumbnail = params['thumbnails'][0]['image']
3571 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3573 # Get all of the formats available
3574 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3575 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3576 webpage, u'download list').strip()
3578 # Get all of the links from the page
3579 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3580 links = re.findall(LINK_RE, download_list_html)
3581 if(len(links) == 0):
3582 raise ExtractorError(u'ERROR: no known formats available for video')
3584 self.to_screen(u'Links found: %d' % len(links))
3589 # A link looks like this:
3590 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3591 # A path looks like this:
3592 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Format label (e.g. '480p-370k') is rebuilt from path components.
3593 video_url = unescapeHTML( link )
3594 path = compat_urllib_parse_urlparse( video_url ).path
3595 extension = os.path.splitext( path )[1][1:]
3596 format = path.split('/')[4].split('_')[:2]
3599 format = "-".join( format )
3600 # title = u'%s-%s-%s' % (video_title, size, bitrate)
3605 'uploader': video_uploader,
3606 'upload_date': upload_date,
3607 'title': video_title,
3610 'thumbnail': thumbnail,
3611 'description': video_description
3614 if self._downloader.params.get('listformats', None):
3615 self._print_formats(formats)
3618 req_format = self._downloader.params.get('format', None)
3619 self.to_screen(u'Format: %s' % req_format)
# formats is ordered best-first: [0] is best, [-1] is worst.
3621 if req_format is None or req_format == 'best':
3623 elif req_format == 'worst':
3624 return [formats[-1]]
3625 elif req_format in ('-1', 'all'):
3628 format = self._specific( req_format, formats )
3630 raise ExtractorError(u'Requested format not available')
# PornotubeIE: information extractor for pornotube.com.
# NOTE(review): numbered listing with gaps (the 'if mobj is None:' guard and
# part of the info dict are missing from this view).
3635 class PornotubeIE(InfoExtractor):
3636 """Information extractor for pornotube.com."""
3637 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
# _real_extract: title comes from the URL itself; the flv URL and upload
# date are scraped from the watch page.
3639 def _real_extract(self, url):
3640 mobj = re.match(self._VALID_URL, url)
3642 raise ExtractorError(u'Invalid URL: %s' % url)
3644 video_id = mobj.group('videoid')
3645 video_title = mobj.group('title')
3647 # Get webpage content
3648 webpage = self._download_webpage(url, video_id)
3651 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3652 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3653 video_url = compat_urllib_parse.unquote(video_url)
3655 #Get the uploaded date
3656 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
# Date is optional; normalized to YYYYMMDD only when present.
3657 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
3658 if upload_date: upload_date = unified_strdate(upload_date)
3660 info = {'id': video_id,
3663 'upload_date': upload_date,
3664 'title': video_title,
# YouJizzIE: information extractor for youjizz.com.
# NOTE(review): numbered listing with gaps (the 'if result is None:' guard
# and parts of the info dict are missing from this view).
3670 class YouJizzIE(InfoExtractor):
3671 """Information extractor for youjizz.com."""
3672 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
# _real_extract: scrapes the title from the main page, follows the embed
# page, and reads the media URL from the player's addVariable call.
3674 def _real_extract(self, url):
3675 mobj = re.match(self._VALID_URL, url)
3677 raise ExtractorError(u'Invalid URL: %s' % url)
3679 video_id = mobj.group('videoid')
3681 # Get webpage content
3682 webpage = self._download_webpage(url, video_id)
3684 # Get the video title
3685 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3686 webpage, u'title').strip()
3688 # Get the embed page
3689 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3691 raise ExtractorError(u'ERROR: unable to extract embed page')
# The numeric embed id replaces the slug-based id from the original URL.
3693 embed_page_url = result.group(0).strip()
3694 video_id = result.group('videoid')
3696 webpage = self._download_webpage(embed_page_url, video_id)
3699 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3700 webpage, u'video URL')
3702 info = {'id': video_id,
3704 'title': video_title,
3707 'player_url': embed_page_url}
# EightTracksIE: information extractor for 8tracks.com mixes (playlists).
# NOTE(review): numbered listing with gaps — 'mix_id' is read below but its
# assignment line is missing from this view, as are the loop's dict opener
# and the final return.
3711 class EightTracksIE(InfoExtractor):
3713 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
# _real_extract: reads the embedded 'PAGE.mix' JSON, then walks the
# play/next API one track at a time until at_last_track is reported.
3715 def _real_extract(self, url):
3716 mobj = re.match(self._VALID_URL, url)
3718 raise ExtractorError(u'Invalid URL: %s' % url)
3719 playlist_id = mobj.group('id')
3721 webpage = self._download_webpage(url, playlist_id)
3723 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3724 data = json.loads(json_like)
# Random session id is required by the 8tracks play API.
3726 session = str(random.randint(0, 1000000000))
3728 track_count = data['tracks_count']
3729 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3730 next_url = first_url
3732 for i in itertools.count():
3733 api_json = self._download_webpage(next_url, playlist_id,
3734 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3735 errnote=u'Failed to download song information')
3736 api_data = json.loads(api_json)
3737 track_data = api_data[u'set']['track']
3739 'id': track_data['id'],
3740 'url': track_data['track_file_stream_url'],
3741 'title': track_data['performer'] + u' - ' + track_data['name'],
3742 'raw_title': track_data['name'],
3743 'uploader_id': data['user']['login'],
3747 if api_data['set']['at_last_track']:
3749 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# KeekIE: information extractor for keek.com.
# NOTE(review): numbered listing with gaps (info-dict opener/closer missing).
3752 class KeekIE(InfoExtractor):
3753 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
# _real_extract: media and thumbnail URLs are built directly from the
# video id against the keek CDN; the page supplies title and uploader.
3756 def _real_extract(self, url):
3757 m = re.match(self._VALID_URL, url)
3758 video_id = m.group('videoID')
3760 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3761 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3762 webpage = self._download_webpage(url, video_id)
3764 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3767 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3768 webpage, u'uploader', fatal=False)
3774 'title': video_title,
3775 'thumbnail': thumbnail,
3776 'uploader': uploader
# TEDIE: information extractor for www.ted.com talks and playlists.
# NOTE(review): numbered listing with gaps — the @classmethod decorator for
# suitable(), parts of the verbose regexes, and the _talk_info return dict
# opener are missing from this view.
3781 class TEDIE(InfoExtractor):
# Verbose (?x)-style pattern; matched with re.VERBOSE below.
3781 _VALID_URL=r'''http://www\.ted\.com/
3783 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3785 ((?P<type_talk>talks)) # We have a simple talk
3787 (/lang/(.*?))? # The url may contain the language
3788 /(?P<name>\w+) # Here goes the name and then ".html"
3792 def suitable(cls, url):
3793 """Receives a URL and returns True if suitable for this IE."""
3794 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# _real_extract: dispatches to a single talk or a playlist of talks.
3796 def _real_extract(self, url):
3797 m=re.match(self._VALID_URL, url, re.VERBOSE)
3798 if m.group('type_talk'):
3799 return [self._talk_info(url)]
3801 playlist_id=m.group('playlist_id')
3802 name=m.group('name')
3803 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3804 return [self._playlist_videos_info(url,name,playlist_id)]
3806 def _playlist_videos_info(self,url,name,playlist_id=0):
3807 '''Returns the videos of the playlist'''
3809 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3810 ([.\s]*?)data-playlist_item_id="(\d+)"
3811 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3813 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3814 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3815 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3816 m_names=re.finditer(video_name_RE,webpage)
3818 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
3819 webpage, 'playlist title')
# Each playlist entry is delegated back to this extractor as a url_result.
3821 playlist_entries = []
3822 for m_video, m_name in zip(m_videos,m_names):
3823 video_id=m_video.group('video_id')
3824 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3825 playlist_entries.append(self.url_result(talk_url, 'TED'))
3826 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3828 def _talk_info(self, url, video_id=0):
3829 """Return the video for the talk in the url"""
3830 m = re.match(self._VALID_URL, url,re.VERBOSE)
3831 video_name = m.group('name')
3832 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
3833 self.report_extraction(video_name)
3834 # If the url includes the language we get the title translated
3835 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
3837 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
3838 webpage, 'json data')
3839 info = json.loads(json_data)
3840 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
3841 webpage, 'description', flags = re.DOTALL)
3843 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
3844 webpage, 'thumbnail')
# Last htmlStreams entry is used as the media URL.
3847 'url': info['htmlStreams'][-1]['file'],
3850 'thumbnail': thumbnail,
3851 'description': desc,
3855 class MySpassIE(InfoExtractor):
3856 _VALID_URL = r'http://www.myspass.de/.*'
3858 def _real_extract(self, url):
3859 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3861 # video id is the last path element of the URL
3862 # usually there is a trailing slash, so also try the second but last
3863 url_path = compat_urllib_parse_urlparse(url).path
3864 url_parent_path, video_id = os.path.split(url_path)
3866 _, video_id = os.path.split(url_parent_path)
3869 metadata_url = META_DATA_URL_TEMPLATE % video_id
3870 metadata_text = self._download_webpage(metadata_url, video_id)
3871 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3873 # extract values from metadata
3874 url_flv_el = metadata.find('url_flv')
3875 if url_flv_el is None:
3876 raise ExtractorError(u'Unable to extract download url')
3877 video_url = url_flv_el.text
3878 extension = os.path.splitext(video_url)[1][1:]
3879 title_el = metadata.find('title')
3880 if title_el is None:
3881 raise ExtractorError(u'Unable to extract title')
3882 title = title_el.text
3883 format_id_el = metadata.find('format_id')
3884 if format_id_el is None:
3887 format = format_id_el.text
3888 description_el = metadata.find('description')
3889 if description_el is not None:
3890 description = description_el.text
3893 imagePreview_el = metadata.find('imagePreview')
3894 if imagePreview_el is not None:
3895 thumbnail = imagePreview_el.text
3904 'thumbnail': thumbnail,
3905 'description': description
# SpiegelIE: information extractor for spiegel.de videos.
# NOTE(review): numbered listing with gaps (info-dict opener/closer missing).
3909 class SpiegelIE(InfoExtractor):
3910 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
# _real_extract: scrapes the title from the page, then reads the flash XML
# descriptor; the last XML entry is taken as the best variant.
3912 def _real_extract(self, url):
3913 m = re.match(self._VALID_URL, url)
3914 video_id = m.group('videoID')
3916 webpage = self._download_webpage(url, video_id)
3918 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3921 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3922 xml_code = self._download_webpage(xml_url, video_id,
3923 note=u'Downloading XML', errnote=u'Failed to download XML')
3925 idoc = xml.etree.ElementTree.fromstring(xml_code)
# idoc[-1]: last child element of the XML root (highest quality variant).
3926 last_type = idoc[-1]
3927 filename = last_type.findall('./filename')[0].text
3928 duration = float(last_type.findall('./duration')[0].text)
3930 video_url = 'http://video2.spiegel.de/flash/' + filename
3931 video_ext = filename.rpartition('.')[2]
3936 'title': video_title,
3937 'duration': duration,
# LiveLeakIE: information extractor for liveleak.com view pages.
# NOTE(review): '(?:http?://)' in _VALID_URL makes only the final 'p' of
# 'http' optional — presumably 'https?://' was intended; confirm before
# changing, since only http URLs are known to be produced by the site.
# Span is a numbered listing with gaps (guard and dict opener missing).
3941 class LiveLeakIE(InfoExtractor):
3943 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3944 IE_NAME = u'liveleak'
# _real_extract: media URL comes from the player config's 'file:' entry;
# title/description/uploader are scraped from page meta tags.
3946 def _real_extract(self, url):
3947 mobj = re.match(self._VALID_URL, url)
3949 raise ExtractorError(u'Invalid URL: %s' % url)
3951 video_id = mobj.group('video_id')
3953 webpage = self._download_webpage(url, video_id)
3955 video_url = self._search_regex(r'file: "(.*?)",',
3956 webpage, u'video URL')
# Site prefixes og:title with 'LiveLeak.com -'; strip it for a clean title.
3958 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3959 webpage, u'title').replace('LiveLeak.com -', '').strip()
3961 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3962 webpage, u'description', fatal=False)
3964 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3965 webpage, u'uploader', fatal=False)
3971 'title': video_title,
3972 'description': video_description,
3973 'uploader': video_uploader
# ARDIE: information extractor for ardmediathek.de / mediathek.daserste.de.
# NOTE(review): numbered listing with gaps — the 'if numid:' / else branch
# headers around lines 3989-3991, the empty-streams guard before 3998, and
# the final return are missing from this view.
3978 class ARDIE(InfoExtractor):
3979 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3980 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3981 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3983 def _real_extract(self, url):
3984 # determine video id from url
# Prefer the numeric documentId query parameter when present.
3985 m = re.match(self._VALID_URL, url)
3987 numid = re.search(r'documentId=([0-9]+)', url)
3989 video_id = numid.group(1)
3991 video_id = m.group('video_id')
3993 # determine title and media streams from webpage
3994 html = self._download_webpage(url, video_id)
3995 title = re.search(self._TITLE, html).group('title')
3996 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + an "fsk" marker means the video is age-restricted.
3998 assert '"fsk"' in html
3999 raise ExtractorError(u'This video is only available after 8:00 pm')
4001 # choose default media type and highest quality for now
4002 stream = max([s for s in streams if int(s["media_type"]) == 0],
4003 key=lambda s: int(s["quality"]))
4005 # there's two possibilities: RTMP stream or HTTP download
4006 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4007 if stream['rtmp_url']:
4008 self.to_screen(u'RTMP download detected')
4009 assert stream['video_url'].startswith('mp4:')
4010 info["url"] = stream["rtmp_url"]
4011 info["play_path"] = stream['video_url']
4013 assert stream["video_url"].endswith('.mp4')
4014 info["url"] = stream["video_url"]
class ZDFIE(InfoExtractor):
    """Information Extractor for the ZDFmediathek (www.zdf.de)."""

    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page <h1> headline, used as the video title.
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    # "play" links carrying streaming type (wstreaming/hstreaming) and quality.
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    # mms:// URL inside the downloaded stream-descriptor page.
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    # rtsp:// fallback URL in the same descriptor.
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        """Resolve a mediathek page to its mms:// (or rtsp://) stream URL."""
        # NOTE(review): several `if ... is None:` / `else:` guard lines around
        # the raises below appear to have been lost in this copy; each raise
        # should only fire when its preceding lookup failed.
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        # NOTE(review): the loop bodies (assigning `stream_` and breaking) are
        # missing from this copy.
        for s in streams: #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
        for s in streams: #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
        raise ExtractorError(u'No stream found.')

        # The chosen stream URL points at a small descriptor page which in
        # turn carries the real mms:// or rtsp:// media URL.
        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Try mms:// first, then fall back to rtsp://.
        mobj = re.search(self._MMS_STREAM, media_link)
        mobj = re.search(self._RTSP_STREAM, media_link)
        raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        # Derive the filename extension from the media URL.
        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
class TumblrIE(InfoExtractor):
    """Information Extractor for Tumblr video posts."""

    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract the video URL, title and thumbnail from a Tumblr post."""
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded in escaped JavaScript (\x22 == '"').
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        # NOTE(review): presumably guarded by `if video is None:`; the guard
        # line appears to be missing from this copy.
        raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        # NOTE(review): the 'url'/'ext' entries and the closing of this return
        # appear to be elided in this copy.
        return [{'id': video_id,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks.

    Only tracks that expose a free download page can be extracted; the
    final mp3-320 URL is obtained through the statdownload endpoint.
    """

    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # FIX: renamed local `id` -> `track_id`; `id` shadowed the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist'],
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        """Extract the mp4 source URL and title from a redtube video page."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): presumably guarded by `if mobj is None:`; the guard
        # line appears to be missing from this copy.
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The page exposes the file directly in a <source> tag.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        # NOTE(review): the closing arguments of this call and the return
        # statement's opening appear to be elided in this copy.
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            'ext': video_extension,
            'title': video_title,
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        """Fetch the per-video MRSS feed and pull the mp4 URL and title."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Metadata lives in an MRSS document served by the player host.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        # Title is wrapped in CDATA inside the feed's <title> element.
        # NOTE(review): the closing arguments of this call and the return
        # statement's opening appear to be elided in this copy.
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            'ext': video_extension,
            'title': video_title,
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Extract URL, title, description and thumbnail of a Howcast video."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Re-fetch via the canonical URL rebuilt from the numeric id.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The mobile mp4 URL is embedded in the player's JS config.
        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # NOTE(review): the closing arguments of this call appear elided.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the return statement's opening and 'id'/'url'/'ext'
        # entries appear to be elided in this copy.
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Extract stream URL, title, thumbnail and uploader of a vine."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The raw stream URL is exposed via the twitter:player:stream meta tag.
        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        # NOTE(review): the closing arguments of this call appear elided.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        # NOTE(review): the return statement's opening and 'id'/'url'/'ext'
        # entries appear to be elided in this copy.
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Resolve a Flickr photo page to its video stream via two XML hops."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret is required by both video XML endpoints.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # Final stream URL = RTMP app + full path from the playlist XML.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        # NOTE(review): presumably guarded by `if mobj is None:`; the guard
        # line appears to be missing from this copy.
        raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the return statement's opening and 'id'/'url'/'ext'
        # entries appear to be elided in this copy.
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Find the numeric video id on the page, then read its data XML."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): presumably guarded by `if mobj is None:`; the guard
        # line appears to be missing from this copy.
        raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The URL only carries the slug; the numeric id is in the article tag.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        # NOTE(review): the closing arguments of this call appear elided.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # Per-video XML manifest listing the downloadable file URLs.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        # NOTE(review): the closing arguments of this call and the return
        # statement's opening appear to be elided in this copy.
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Extract the media URL plus title/upload date/uploader/thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Canonical page URL rebuilt from the numeric id.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config carries an optional server plus the file token.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        # NOTE(review): presumably guarded by `if mobj is None:`; the guard
        # line appears to be missing from this copy.
        raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server -> 'file' is already a (url-encoded) absolute URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        # NOTE(review): an `else:` branch appears to be elided before this line.
        video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        # NOTE(review): the closing arguments of this call appear elided.
        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date is embedded as a tooltip hint 'YYYY-MM-DD hh:mm:ss TZ';
        # NOTE(review): the if/else around the two assignments below appears
        # to be elided (warning branch should only run when the hint is absent).
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        video_upload_date = None
        self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the return statement's opening and 'id'/'url' entries
        # appear to be elided in this copy.
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Resolve a Hype Machine track page to the served audio URL."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): presumably guarded by `if mobj is None:`; the guard
        # line appears to be missing from this copy.
        raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # A timestamped query is required; keep the Set-Cookie for /serve.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # Track metadata is embedded as JSON in a <script> tag.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        # NOTE(review): upstream wraps the two lines below in try/except
        # ValueError; the raise should only fire on a JSON decode failure.
        track_list = json.loads(html_tracks)
        track = track_list[u'tracks'][0]
        raise ExtractorError(u'Hypemachine contained invalid JSON.')

        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # NOTE(review): `key` is read below but its assignment (presumably
        # key = track[u"key"]) is missing from this copy -- confirm upstream.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        # NOTE(review): same elided try/except pattern as above.
        song_data = json.loads(song_data_json)
        raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        """Follow the JS redirect, then POST to magare.do for the media URL."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): presumably guarded by `if mobj is None:`; the guard
        # line appears to be missing from this copy.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects via JS; follow the relative location by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # The info endpoint answers a form POST with '&'-separated k=v pairs.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        # NOTE(review): the return statement's opening and the other dict
        # entries appear to be elided in this copy.
            'thumbnail': thumbnail_url,
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com videos, reviews and
    full episodes (MTV-hosted media, resolved via the mrss/mediagen feeds).
    """

    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes store the mgid in data-video; other page types use
        # data-contentId.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        # Two feeds: mrss carries the metadata, mediagen the stream URLs.
        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        # FIX: was `raise ExtractError(u'Unable to extrat video url')` --
        # ExtractError does not exist (would raise NameError instead) and the
        # message was misspelled; also `list()` never returns None, so a
        # simple truthiness test suffices.
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url': video_url,
                'id': video_id,
                'title': video_title,
                # Videos are actually flv not mp4
                'ext': 'flv',
                'thumbnail': video_thumb,
                'description': video_description,
                }
class StatigrIE(InfoExtractor):
    """Information Extractor for Statigram (statigr.am) Instagram video pages."""

    _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)
        webpage = self._download_webpage(url, video_id)
        video_url = re.search(r'<meta property="og:video:secure_url" content="(.+?)">',webpage).group(1)
        thumbnail_url = re.search(r'<meta property="og:image" content="(.+?)" />',webpage).group(1)
        title = re.search(r'<title>(.+?)</title>',webpage).group(1)
        # FIX: this used title.strip("| Statigram"), but str.strip removes any
        # of those *characters* from both ends (mangling titles that begin or
        # end with e.g. 'S', 'a', 'm', ...); remove the literal suffix instead.
        suffix = ' | Statigram'
        if title.endswith(suffix):
            title = title[:-len(suffix)]
        # Titles look like '@user (Videos) ...', so the uploader can be pulled
        # back out of the cleaned title.
        uploader = re.search(r'@(.+) \(Videos\)',title).group(1)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail_url,
            'uploader' : uploader
        }]
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): the opening `return [` and most entries of the extractor
    # list appear to be missing from this copy; only three survive below.
    YoutubePlaylistIE(),
    StanfordOpenClassroomIE(),
    WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    class_name = '%sIE' % ie_name
    return globals()[class_name]