2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# NOTE(review): numbered excerpt — the left-hand numbers are the original file's
# line numbers; gaps in that numbering mean source lines are elided from this view.
# Code below is kept byte-identical; only comments are added.
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information about the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title, author and
32 others. The information is stored in a dictionary which is then
33 passed to the FileDownloader. The FileDownloader processes this
34 information possibly downloading the video to the file system, among
35 other possible outcomes.
37 The dictionaries must include the following fields:
41 title: Video title, unescaped.
42 ext: Video filename extension.
44 The following fields are optional:
46 format: The video format, defaults to ext (used for --get-format)
47 thumbnail: Full URL to a video thumbnail image.
48 description: One-line video description.
49 uploader: Full name of the video uploader.
50 upload_date: Video upload date (YYYYMMDD).
51 uploader_id: Nickname or id of the video uploader.
52 location: Physical location of the video.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The subtitle file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
# Constructor: only records the (optional) FileDownloader on the instance.
75 def __init__(self, downloader=None):
76 """Constructor. Receives an optional downloader."""
78 self.set_downloader(downloader)
# NOTE(review): presumably decorated @classmethod (decorator line 80 elided) — confirm.
81 def suitable(cls, url):
82 """Receives a URL and returns True if suitable for this IE."""
83 return re.match(cls._VALID_URL, url) is not None
# working(): getter for the _WORKING flag ('def' line 86 elided in this excerpt).
87 """Getter method for _WORKING."""
# initialize(): 'def' line and the once-only guard around _real_initialize() elided.
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
96 def extract(self, url):
97 """Extracts URL information and returns it in list of dicts."""
# (line 98, a call to self.initialize(), elided here)
99 return self._real_extract(url)
101 def set_downloader(self, downloader):
102 """Sets the downloader for this IE."""
103 self._downloader = downloader
# Template methods: subclasses override these two ('pass' bodies elided).
105 def _real_initialize(self):
106 """Real initialization process. Redefine in subclasses."""
109 def _real_extract(self, url):
110 """Real extraction process. Redefine in subclasses."""
# IE_NAME property body: strips the trailing 'IE' from the class name
# (the 'def'/property lines 112-114 are elided in this excerpt).
115 return type(self).__name__[:-2]
117 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
118 """ Returns the response handle """
# NOTE(review): the 'if note is None:' branch that pairs with this elif is elided.
120 self.report_download_webpage(video_id)
121 elif note is not False:
122 self.to_screen(u'%s: %s' % (video_id, note))
# The enclosing 'try:' line (123) is elided here.
124 return compat_urllib_request.urlopen(url_or_request)
125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Default error note when the caller supplied none; chains the network error.
127 errnote = u'Unable to download webpage'
128 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
130 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
131 """ Returns a tuple (page content as string, URL handle) """
132 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
# Try to honour the charset declared in the Content-Type header;
# the fallback-to-utf-8 branch (elided) runs when the regex does not match.
133 content_type = urlh.headers.get('Content-Type', '')
134 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
136 encoding = m.group(1)
139 webpage_bytes = urlh.read()
140 if self._downloader.params.get('dump_intermediate_pages', False):
# get_full_url() exists on Request objects; plain string URLs raise
# AttributeError and are handled by the (elided) fallback assignment.
142 url = url_or_request.get_full_url()
143 except AttributeError:
145 self.to_screen(u'Dumping request to ' + url)
# base64 keeps binary-safe output when dumping the raw page to the screen.
146 dump = base64.b64encode(webpage_bytes).decode('ascii')
147 self._downloader.to_screen(dump)
# 'replace' avoids raising on pages whose declared charset is wrong.
148 content = webpage_bytes.decode(encoding, 'replace')
149 return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
155 def to_screen(self, msg):
156 """Print msg to screen, prefixing it with '[ie_name]'"""
157 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
158 # Thin reporting helpers so subclasses log with a uniform prefix.
159 def report_extraction(self, id_or_name):
160 """Report information extraction."""
161 self.to_screen(u'%s: Extracting information' % id_or_name)
163 def report_download_webpage(self, video_id):
164 """Report webpage download."""
165 self.to_screen(u'%s: Downloading webpage' % video_id)
167 def report_age_confirmation(self):
168 """Report attempt to confirm age."""
169 self.to_screen(u'Confirming age')
171 #Methods for following #608
172 #They set the correct value of the '_type' key
173 def video_result(self, video_info):
174 """Returns a video"""
175 video_info['_type'] = 'video'
# (the 'return video_info' line is elided here)
177 def url_result(self, url, ie=None):
178 """Returns a url that points to a page that should be processed"""
179 #TODO: ie should be the class used for getting the info
180 video_info = {'_type': 'url',
# (remaining dict entries and the return statement are elided)
184 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
185 """Returns a playlist"""
186 video_info = {'_type': 'playlist',
# id/title are only set when provided (the guarding 'if' lines are elided).
189 video_info['id'] = playlist_id
191 video_info['title'] = playlist_title
194 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
# (docstring opening triple-quote line 195 elided)
196 Perform a regex search on the given string, using a single or a list of
197 patterns returning the first matching group.
198 In case of failure return a default value or raise a WARNING or a
199 ExtractorError, depending on fatal, specifying the field name.
# Accept either one pattern or an iterable of patterns; the loop header
# for the list case (line 204) is elided.
201 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
202 mobj = re.search(pattern, string, flags)
205 mobj = re.search(p, string, flags)
# Colorize the field name on capable terminals (ANSI blue); plain on Windows.
208 if sys.stderr.isatty() and os.name != 'nt':
209 _name = u'\033[0;34m%s\033[0m' % name
214 # return the first matching group
215 return next(g for g in mobj.groups() if g is not None)
216 elif default is not None:
# (the 'return default' and the 'elif fatal:' lines are elided)
219 raise ExtractorError(u'Unable to extract %s' % _name)
221 self._downloader.report_warning(u'unable to extract %s; '
222 u'please report this issue on GitHub.' % _name)
225 def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
# (docstring opening line 226 elided)
227 Like _search_regex, but strips HTML tags and unescapes entities.
229 res = self._search_regex(pattern, string, name, default, fatal, flags)
# (the 'if res:' guard line 230 is elided; None falls through to the elided else)
231 return clean_html(res).strip()
# NOTE(review): numbered excerpt — gaps in the left-hand line numbers mean
# source lines are elided from this view. Code kept byte-identical.
235 class SearchInfoExtractor(InfoExtractor):
# (docstring opening line 236 elided)
237 Base class for paged search queries extractors.
238 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
239 Instances should define _SEARCH_KEY and _MAX_RESULTS.
# NOTE(review): presumably decorated @classmethod (decorator line 242 elided) — confirm.
243 def _make_valid_url(cls):
# Matches '<key>:', '<key>N:' (N >= 1) or '<key>all:' followed by the query text.
244 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
# NOTE(review): presumably decorated @classmethod (decorator line 246 elided) — confirm.
247 def suitable(cls, url):
248 return re.match(cls._make_valid_url(), url) is not None
250 def _real_extract(self, query):
251 mobj = re.match(self._make_valid_url(), query)
# (the 'if mobj is None:' guard line 252 is elided)
253 raise ExtractorError(u'Invalid search query "%s"' % query)
255 prefix = mobj.group('prefix')
256 query = mobj.group('query')
# Empty prefix -> single best result; 'all' -> cap at _MAX_RESULTS;
# otherwise (elided 'else' branch) parse the numeric prefix into n.
258 return self._get_n_results(query, 1)
259 elif prefix == 'all':
260 return self._get_n_results(query, self._MAX_RESULTS)
# (int(prefix) conversion and 'if n <= 0:' guard lines are elided)
264 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
265 elif n > self._MAX_RESULTS:
266 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
267 n = self._MAX_RESULTS
268 return self._get_n_results(query, n)
270 def _get_n_results(self, query, n):
271 """Get a specified number of results for a query"""
# Abstract hook; subclasses implement the actual paged fetching.
# (typo 'sublclasses' is in the runtime string — left untouched.)
272 raise NotImplementedError("This method must be implemented by sublclasses")
# NOTE(review): numbered excerpt — gaps in the left-hand line numbers mean
# source lines are elided from this view (try/except scaffolding, dict
# literals, guards). Code kept byte-identical; only comments are added.
275 class YoutubeIE(InfoExtractor):
276 """Information extractor for youtube.com."""
# _VALID_URL verbose-regex body; its opening assignment line
# (_VALID_URL = r"""^( ...) is elided from this excerpt.
280 (?:https?://)? # http(s):// (optional)
281 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
282 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
283 (?:.*?\#/)? # handle anchor (#/) redirect urls
284 (?: # the various things that can precede the ID:
285 (?:(?:v|embed|e)/) # v/ or embed/ or e/
286 |(?: # or the v= param in all its forms
287 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
288 (?:\?|\#!?) # the params delimiter ? or # or #!
289 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
292 )? # optional -> youtube.com/xxxx is OK
293 )? # all until now is optional -> you can pass the naked ID
294 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
295 (?(1).+)? # if we found the ID, everything can follow
# Endpoint constants used by _real_initialize() / _real_extract().
297 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
298 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
299 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
300 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
301 _NETRC_MACHINE = 'youtube'
302 # Listed in order of quality
303 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
304 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
305 _video_extensions = {
# (most itag -> extension entries elided from this excerpt)
311 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display-dimension map (entries elided).
317 _video_dimensions = {
# NOTE(review): presumably decorated @classmethod (decorator line elided) — confirm.
336 def suitable(cls, url):
337 """Receives a URL and returns True if suitable for this IE."""
# Playlist URLs would also match _VALID_URL; defer them to YoutubePlaylistIE.
338 if YoutubePlaylistIE.suitable(url): return False
339 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# --- reporting helpers (uniform '[youtube] ...' log lines) ---
341 def report_lang(self):
342 """Report attempt to set language."""
343 self.to_screen(u'Setting language')
345 def report_login(self):
346 """Report attempt to log in."""
347 self.to_screen(u'Logging in')
349 def report_video_webpage_download(self, video_id):
350 """Report attempt to download video webpage."""
351 self.to_screen(u'%s: Downloading video webpage' % video_id)
353 def report_video_info_webpage_download(self, video_id):
354 """Report attempt to download video info webpage."""
355 self.to_screen(u'%s: Downloading video info webpage' % video_id)
357 def report_video_subtitles_download(self, video_id):
358 """Report attempt to download video info webpage."""
359 self.to_screen(u'%s: Checking available subtitles' % video_id)
361 def report_video_subtitles_request(self, video_id, sub_lang, format):
362 """Report attempt to download video info webpage."""
363 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
365 def report_video_subtitles_available(self, video_id, sub_lang_list):
366 """Report available subtitles."""
367 sub_lang = ",".join(list(sub_lang_list.keys()))
368 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
370 def report_information_extraction(self, video_id):
371 """Report attempt to extract video information."""
372 self.to_screen(u'%s: Extracting video information' % video_id)
374 def report_unavailable_format(self, video_id, format):
375 """Report extracted video URL."""
376 self.to_screen(u'%s: Format %s not available' % (video_id, format))
378 def report_rtmp_download(self):
379 """Indicate the download will use the RTMP protocol."""
380 self.to_screen(u'RTMP download detected')
382 def _get_available_subtitles(self, video_id):
# Returns {lang_code: lang_name} on success, or an (error_message, None)
# tuple on failure — callers distinguish the two via isinstance(..., tuple).
383 self.report_video_subtitles_download(video_id)
384 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
# (the enclosing 'try:' line 385 is elided)
386 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
387 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
388 return (u'unable to download video subtitles: %s' % compat_str(err), None)
389 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
390 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
391 if not sub_lang_list:
392 return (u'video doesn\'t have subtitles', None)
# (the success 'return sub_lang_list' line is elided)
395 def _list_available_subtitles(self, video_id):
396 sub_lang_list = self._get_available_subtitles(video_id)
397 self.report_video_subtitles_available(video_id, sub_lang_list)
399 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
# Returns a (error_message, sub_lang, sub) tuple; docstring lines elided.
402 (error_message, sub_lang, sub)
404 self.report_video_subtitles_request(video_id, sub_lang, format)
405 params = compat_urllib_parse.urlencode({
# (query-parameter entries lang/name/v/fmt are elided)
411 url = 'http://www.youtube.com/api/timedtext?' + params
# (the enclosing 'try:' line 412 is elided)
413 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
# (the 'if not sub:' guard line 416 is elided)
417 return (u'Did not fetch video subtitles', None, None)
418 return (None, sub_lang, sub)
420 def _request_automatic_caption(self, video_id, webpage):
421 """We need the webpage for getting the captions url, pass it as an
422 argument to speed up the process."""
423 sub_lang = self._downloader.params.get('subtitleslang')
424 sub_format = self._downloader.params.get('subtitlesformat')
425 self.to_screen(u'%s: Looking for automatic captions' % video_id)
# The player configuration JSON is embedded inline in the watch page.
426 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
427 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
# (the 'if mobj is None:' guard line 428 is elided)
429 return [(err_msg, None, None)]
430 player_config = json.loads(mobj.group(1))
# (an enclosing 'try:' line 431 is elided; KeyError falls to line 446)
432 args = player_config[u'args']
433 caption_url = args[u'ttsurl']
434 timestamp = args[u'timestamp']
435 params = compat_urllib_parse.urlencode({
# (caption query-parameter entries elided)
442 subtitles_url = caption_url + '&' + params
443 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
444 return [(None, sub_lang, sub)]
# (the matching 'except KeyError:' line 445 is elided)
446 return [(err_msg, None, None)]
448 def _extract_subtitle(self, video_id):
# Docstring body (opening triple-quote elided):
450 Return a list with a tuple:
451 [(error_message, sub_lang, sub)]
453 sub_lang_list = self._get_available_subtitles(video_id)
454 sub_format = self._downloader.params.get('subtitlesformat')
455 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
456 return [(sub_lang_list[0], None, None)]
# Language preference: explicit --sub-lang > 'en' > first available.
457 if self._downloader.params.get('subtitleslang', False):
458 sub_lang = self._downloader.params.get('subtitleslang')
459 elif 'en' in sub_lang_list:
# (the "sub_lang = 'en'" line and the trailing 'else:' are elided)
462 sub_lang = list(sub_lang_list.keys())[0]
463 if not sub_lang in sub_lang_list:
464 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
466 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
# (the 'return [subtitle]' line is elided)
469 def _extract_all_subtitles(self, video_id):
470 sub_lang_list = self._get_available_subtitles(video_id)
471 sub_format = self._downloader.params.get('subtitlesformat')
472 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
473 return [(sub_lang_list[0], None, None)]
# (the 'subtitles = []' initialization line 474 is elided)
475 for sub_lang in sub_lang_list:
476 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
477 subtitles.append(subtitle)
# (the 'return subtitles' line is elided)
480 def _print_formats(self, formats):
481 print('Available formats:')
# (the 'for x in formats:' loop header line 482 is elided)
483 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
485 def _real_initialize(self):
# Sets language, then attempts login (username/password or .netrc) and
# age confirmation. Large parts of the scaffolding are elided below.
486 if self._downloader is None:
# (the early 'return' line is elided)
491 downloader_params = self._downloader.params
493 # Attempt to use provided username and password or .netrc data
494 if downloader_params.get('username', None) is not None:
495 username = downloader_params['username']
496 password = downloader_params['password']
497 elif downloader_params.get('usenetrc', False):
# (the enclosing 'try:' line 498 is elided)
499 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
# (username/password unpacking and the 'else:' raising branch are elided)
504 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
505 except (IOError, netrc.NetrcParseError) as err:
506 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# (the 'return' line after the warning is elided)
510 request = compat_urllib_request.Request(self._LANG_URL)
# (the 'try:' line and self.report_lang() call are elided)
513 compat_urllib_request.urlopen(request).read()
514 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
515 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
# (the 'return' line 516 is elided)
518 # No authentication to be performed
# (the 'if username is None: return' guard lines are elided)
522 request = compat_urllib_request.Request(self._LOGIN_URL)
# (the 'try:' line 523 is elided)
524 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
525 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
526 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
# (the 'return' line is elided)
# GALX and dsh are anti-forgery tokens scraped from the login form.
531 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
# (the 'if match:' guard line 532 is elided)
533 galx = match.group(1)
535 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# Login POST form; many field entries are elided from this excerpt.
541 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
545 u'PersistentCookie': u'yes',
547 u'bgresponse': u'js_disabled',
548 u'checkConnection': u'',
549 u'checkedDomains': u'youtube',
555 u'signIn': u'Sign in',
557 u'service': u'youtube',
561 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
# (rest of that comment, line 562, is elided)
563 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
564 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
565 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
# (the 'try:' line and self.report_login() call are elided)
568 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# A re-rendered login form in the response means authentication failed.
569 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
570 self._downloader.report_warning(u'unable to log in: bad username or password')
# (the 'return' line 571 is elided)
572 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
573 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Age-confirmation POST (form-dict opening lines elided).
579 'action_confirm': 'Confirm',
581 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
# (the 'try:' line 582 is elided)
583 self.report_age_confirmation()
584 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
585 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
586 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
588 def _extract_id(self, url):
# Pull the 11-char-style video ID out of any supported YouTube URL shape.
589 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# (the 'if mobj is None:' guard line 590 is elided)
591 raise ExtractorError(u'Invalid URL: %s' % url)
592 video_id = mobj.group(2)
# (the 'return video_id' line is elided)
595 def _real_extract(self, url):
596 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
597 mobj = re.search(self._NEXT_URL_RE, url)
# (the 'if mobj:' guard line 598 is elided)
599 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
600 video_id = self._extract_id(url)
# Get video webpage
603 self.report_video_webpage_download(video_id)
604 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
605 request = compat_urllib_request.Request(url)
# (the 'try:' line 606 is elided)
607 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
608 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
609 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
611 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
613 # Attempt to extract SWF player URL
614 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JS-escaped URL; the 'else: player_url = None' branch is elided.
616 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Get video info: try several 'el' variants until one yields a token.
621 self.report_video_info_webpage_download(video_id)
622 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
623 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
624 % (video_id, el_type))
625 video_info_webpage = self._download_webpage(video_info_url, video_id,
# (the 'note=False,' argument line 626 is elided)
627 errnote='unable to download video info webpage')
628 video_info = compat_parse_qs(video_info_webpage)
629 if 'token' in video_info:
# (the 'break' line 630 is elided)
631 if 'token' not in video_info:
632 if 'reason' in video_info:
633 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
# (the 'else:' line 634 is elided)
635 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
637 # Check for "rental" videos
638 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
639 raise ExtractorError(u'"rental" videos not supported')
641 # Start extracting information
642 self.report_information_extraction(video_id)
# uploader
645 if 'author' not in video_info:
646 raise ExtractorError(u'Unable to extract uploader name')
647 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# uploader_id: best-effort scrape from the watch page; warn if absent.
650 video_uploader_id = None
651 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
# (the 'if mobj is not None:' guard line 652 is elided)
653 video_uploader_id = mobj.group(1)
# (the 'else:' line 654 is elided)
655 self._downloader.report_warning(u'unable to extract uploader nickname')
# title
658 if 'title' not in video_info:
659 raise ExtractorError(u'Unable to extract video title')
660 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail image
663 if 'thumbnail_url' not in video_info:
664 self._downloader.report_warning(u'unable to extract video thumbnail')
# (the "video_thumbnail = ''" fallback line 665 is elided)
666 else: # don't panic if we can't find it
667 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date (normalized to YYYYMMDD by unified_strdate)
671 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
# (the 'upload_date = None' default and 'if mobj is not None:' guard are elided)
673 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
674 upload_date = unified_strdate(upload_date)
# description
677 video_description = get_element_by_id("eow-description", video_webpage)
678 if video_description:
679 video_description = clean_html(video_description)
# Fallback: the <meta name="description"> tag ('else:' line 680 elided).
681 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
# (the 'if fd_mobj:' guard line 682 is elided)
683 video_description = unescapeHTML(fd_mobj.group(1))
# (the 'else:' line 684 is elided)
685 video_description = u''
# subtitles
688 video_subtitles = None
690 if self._downloader.params.get('writesubtitles', False):
691 video_subtitles = self._extract_subtitle(video_id)
# (the 'if video_subtitles:' guard line 692 is elided)
693 (sub_error, sub_lang, sub) = video_subtitles[0]
# (the 'if sub_error:' guard line 694 is elided)
695 # We try with the automatic captions
696 video_subtitles = self._request_automatic_caption(video_id, video_webpage)
697 (sub_error_auto, sub_lang, sub) = video_subtitles[0]
# (the 'if sub is not None: pass / else:' scaffolding lines are elided)
701 # We report the original error
702 self._downloader.report_error(sub_error)
704 if self._downloader.params.get('allsubtitles', False):
705 video_subtitles = self._extract_all_subtitles(video_id)
706 for video_subtitle in video_subtitles:
707 (sub_error, sub_lang, sub) = video_subtitle
# (the 'if sub_error:' guard line 708 is elided)
709 self._downloader.report_error(sub_error)
711 if self._downloader.params.get('listsubtitles', False):
712 sub_lang_list = self._list_available_subtitles(video_id)
# (the 'return' line 713 is elided)
715 if 'length_seconds' not in video_info:
716 self._downloader.report_warning(u'unable to extract video duration')
# (the 'video_duration = ""' fallback and 'else:' lines are elided)
719 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
# token
722 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
724 # Decide which formats to download
725 req_format = self._downloader.params.get('format', None)
727 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
728 self.report_rtmp_download()
729 video_url_list = [(None, video_info['conn'][0])]
730 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# (the 'url_map = {}' initialization line 731 is elided)
732 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
733 url_data = compat_parse_qs(url_data_str)
734 if 'itag' in url_data and 'url' in url_data:
735 url = url_data['url'][0]
736 if 'sig' in url_data:
737 url += '&signature=' + url_data['sig'][0]
738 if 'ratebypass' not in url:
739 url += '&ratebypass=yes'
740 url_map[url_data['itag'][0]] = url
742 format_limit = self._downloader.params.get('format_limit', None)
743 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
744 if format_limit is not None and format_limit in available_formats:
745 format_list = available_formats[available_formats.index(format_limit):]
# (the 'else:' line 746 is elided)
747 format_list = available_formats
748 existing_formats = [x for x in format_list if x in url_map]
749 if len(existing_formats) == 0:
750 raise ExtractorError(u'no known formats available for video')
751 if self._downloader.params.get('listformats', None):
752 self._print_formats(existing_formats)
# (the 'return' line 753 is elided)
754 if req_format is None or req_format == 'best':
755 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
756 elif req_format == 'worst':
757 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
758 elif req_format in ('-1', 'all'):
759 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
# (the trailing 'else:' line 760 is elided)
761 # Specific formats. We pick the first in a slash-delimeted sequence.
762 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
763 req_formats = req_format.split('/')
764 video_url_list = None
765 for rf in req_formats:
# (the 'if rf in url_map:' guard line 766 is elided)
767 video_url_list = [(rf, url_map[rf])]
# (the 'break' line 768 is elided)
769 if video_url_list is None:
770 raise ExtractorError(u'requested format not available')
# (the 'else:' line 771 that pairs with the elif chain is elided)
772 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# Build one result dict per selected format ('results = []' line elided).
775 for format_param, video_real_url in video_url_list:
# Extension
777 video_extension = self._video_extensions.get(format_param, 'flv')
779 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
780 self._video_dimensions.get(format_param, '???'))
# (the 'results.append({' opening and 'id' entry lines are elided)
784 'url': video_real_url,
785 'uploader': video_uploader,
786 'uploader_id': video_uploader_id,
787 'upload_date': upload_date,
788 'title': video_title,
789 'ext': video_extension,
790 'format': video_format,
791 'thumbnail': video_thumbnail,
792 'description': video_description,
793 'player_url': player_url,
794 'subtitles': video_subtitles,
795 'duration': video_duration
# (the closing '})' and 'return results' lines are elided)
# NOTE(review): numbered excerpt — gaps in the left-hand line numbers mean
# source lines are elided from this view. Code kept byte-identical.
800 class MetacafeIE(InfoExtractor):
801 """Information Extractor for metacafe.com."""
803 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
804 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
805 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
806 IE_NAME = u'metacafe'
808 def report_disclaimer(self):
809 """Report disclaimer retrieval."""
810 self.to_screen(u'Retrieving disclaimer')
812 def _real_initialize(self):
# Fetch the disclaimer page, then POST the family-filter/age form so that
# subsequent video pages are served unfiltered.
813 # Retrieve disclaimer
814 request = compat_urllib_request.Request(self._DISCLAIMER)
# (the 'try:' line 815 is elided)
816 self.report_disclaimer()
817 disclaimer = compat_urllib_request.urlopen(request).read()
818 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
819 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
# Age-confirmation form; its opening dict lines are elided.
824 'submit': "Continue - I'm over 18",
826 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
# (the 'try:' line 827 is elided)
828 self.report_age_confirmation()
829 disclaimer = compat_urllib_request.urlopen(request).read()
830 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
831 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
833 def _real_extract(self, url):
834 # Extract id and simplified title from URL
835 mobj = re.match(self._VALID_URL, url)
# (the 'if mobj is None:' guard line 836 is elided)
837 raise ExtractorError(u'Invalid URL: %s' % url)
839 video_id = mobj.group(1)
841 # Check if video comes from YouTube
842 mobj2 = re.match(r'^yt-(.*)$', video_id)
843 if mobj2 is not None:
# Delegate 'yt-<id>' entries to the YouTube extractor.
844 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
846 # Retrieve video webpage to extract further information
847 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
849 # Extract URL, uploader and title from webpage
850 self.report_extraction(video_id)
851 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
# Primary path: direct mediaURL param; fallback (below) parses flashvars.
# (the 'if mobj is not None:' guard line 852 is elided)
853 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
854 video_extension = mediaURL[-3:]
856 # Extract gdaKey if available
857 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
# (the 'if mobj is None: video_url = mediaURL / else:' lines are elided)
861 gdaKey = mobj.group(1)
862 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: dig the media URL + key out of the flashvars blob.
864 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
# (the 'if mobj is None:' guard line 865 is elided)
866 raise ExtractorError(u'Unable to extract media URL')
867 vardict = compat_parse_qs(mobj.group(1))
868 if 'mediaData' not in vardict:
869 raise ExtractorError(u'Unable to extract media URL')
870 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
# (the 'if mobj is None:' guard line 871 is elided)
872 raise ExtractorError(u'Unable to extract media URL')
873 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
874 video_extension = mediaURL[-3:]
875 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
877 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
# (the 'if mobj is None:' guard line 878 is elided)
879 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode() on a str here is Python-2-only behavior — confirm
# against the project's compat layer before relying on this under Python 3.
880 video_title = mobj.group(1).decode('utf-8')
882 mobj = re.search(r'submitter=(.*?);', webpage)
# (the 'if mobj is None:' guard line 883 is elided)
884 raise ExtractorError(u'Unable to extract uploader nickname')
885 video_uploader = mobj.group(1)
# Result list ('return [{' opening lines 887 elided).
888 'id': video_id.decode('utf-8'),
889 'url': video_url.decode('utf-8'),
890 'uploader': video_uploader.decode('utf-8'),
# (the "'upload_date': None," entry line 891 is elided)
892 'title': video_title,
893 'ext': video_extension.decode('utf-8'),
# (the closing '}]' line is elided)
# NOTE(review): numbered excerpt — gaps in the left-hand line numbers mean
# source lines are elided from this view. Code kept byte-identical.
896 class DailymotionIE(InfoExtractor):
897 """Information Extractor for Dailymotion"""
899 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
900 IE_NAME = u'dailymotion'
902 def _real_extract(self, url):
903 # Extract id and simplified title from URL
904 mobj = re.match(self._VALID_URL, url)
# (the 'if mobj is None:' guard line 905 is elided)
906 raise ExtractorError(u'Invalid URL: %s' % url)
# The path segment is '<id>_<slug>'; keep only the id, drop any query part.
908 video_id = mobj.group(1).split('_')[0].split('?')[0]
910 video_extension = 'mp4'
912 # Retrieve video webpage to extract further information
913 request = compat_urllib_request.Request(url)
# Cookie disables Dailymotion's family filter for this request.
914 request.add_header('Cookie', 'family_filter=off')
915 webpage = self._download_webpage(request, video_id)
917 # Extract URL, uploader and title from webpage
918 self.report_extraction(video_id)
919 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
# (the 'if mobj is None:' guard line 920 is elided)
921 raise ExtractorError(u'Unable to extract media URL')
922 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Pick the best available quality, highest first
# (the 'max_quality' loop scaffolding lines are partially elided).
924 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
# (the 'if key in flashvars: max_quality = key' lines are elided)
927 self.to_screen(u'Using %s' % key)
# (the 'break' and trailing 'else:' lines are elided)
930 raise ExtractorError(u'Unable to extract video URL')
932 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
# (the 'if mobj is None:' guard line 933 is elided)
934 raise ExtractorError(u'Unable to extract video URL')
# Un-escape the JSON-escaped slashes in the media URL.
936 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
938 # TODO: support choosing qualities
940 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
# (the 'if mobj is None:' guard line 941 is elided)
942 raise ExtractorError(u'Unable to extract title')
943 video_title = unescapeHTML(mobj.group('title'))
945 video_uploader = None
946 video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
947 # Looking for official user
948 r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
949 webpage, 'video uploader')
951 video_upload_date = None
952 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
# DD-MM-YYYY on the page -> YYYYMMDD ('if mobj is not None:' guard elided).
954 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
# Result list ('return [{' opening and id/url entries elided).
959 'uploader': video_uploader,
960 'upload_date': video_upload_date,
961 'title': video_title,
962 'ext': video_extension,
# (the closing '}]' line is elided)
# NOTE(review): numbered excerpt — gaps in the left-hand line numbers mean
# source lines are elided from this view. Code kept byte-identical.
966 class PhotobucketIE(InfoExtractor):
967 """Information extractor for photobucket.com."""
969 # TODO: the original _VALID_URL was:
970 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
971 # Check if it's necessary to keep the old extracion process
972 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
973 IE_NAME = u'photobucket'
975 def _real_extract(self, url):
976 # Extract id from URL
977 mobj = re.match(self._VALID_URL, url)
# (the 'if mobj is None:' guard line 978 is elided)
979 raise ExtractorError(u'Invalid URL: %s' % url)
981 video_id = mobj.group('id')
983 video_extension = mobj.group('ext')
985 # Retrieve video webpage to extract further information
986 webpage = self._download_webpage(url, video_id)
988 # Extract URL, uploader, and title from webpage
989 self.report_extraction(video_id)
990 # We try first by looking the javascript code:
991 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
# (the 'if mobj is not None:' guard line 992 is elided)
993 info = json.loads(mobj.group('json'))
# Result dict built from the page's embedded JSON
# (the 'return [{' opening and 'id' entry lines are elided).
996 'url': info[u'downloadUrl'],
997 'uploader': info[u'username'],
998 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
999 'title': info[u'title'],
1000 'ext': video_extension,
1001 'thumbnail': info[u'thumbUrl'],
# (the closing '}]' line is elided)
1004 # We try looking in other parts of the webpage
1005 video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1006 webpage, u'video URL')
1008 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
# (the 'if mobj is None:' guard line 1009 is elided)
1010 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode() on a str is Python-2-only — confirm against the
# project's compat layer before relying on this under Python 3.
1011 video_title = mobj.group(1).decode('utf-8')
1012 video_uploader = mobj.group(2).decode('utf-8')
# Fallback result dict ('return [{' opening line 1014 elided).
1015 'id': video_id.decode('utf-8'),
1016 'url': video_url.decode('utf-8'),
1017 'uploader': video_uploader,
1018 'upload_date': None,
1019 'title': video_title,
1020 'ext': video_extension.decode('utf-8'),
# (the closing '}]' line is elided)
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    # NOTE(review): excerpt appears elided — "if ...:" guards and the final
    # "return [{ ... }]" wrapper look missing; comments cover visible code only.
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an "if mobj is None:" guard presumably preceded this raise
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        # The page may expose an alternative content id via YUI Media
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        # Branch 1 (presumably "if m_id is None:"): query the cosmos REST API
        # with the id taken from the URL itself.
        # TODO: Check which url parameters are required
        info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
        webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
        # NOTE(review): this raw-string regex literal is not closed within the
        # visible lines — the closing quotes appear elided.
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                    <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                    <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                    <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
        self.report_extraction(video_id)
        m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
        # NOTE(review): an "if m_info is None:" guard presumably preceded this raise
        raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')
        video_date = m_info.group('date')
        # Feed date is MM/DD/YYYY; normalize to the YYYYMMDD convention
        video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

        # TODO: Find a way to get mp4 videos
        rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
        webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
        m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
        video_url = m_rest.group('url')
        video_path = m_rest.group('path')
        # NOTE(review): an "if m_rest is None:" guard presumably preceded this raise
        raise ExtractorError(u'Unable to extract video url')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            # YQL query against yahoo.media.video.streams for the long id
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP callback wrapper before parsing
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            # NOTE(review): "meta = res[u'meta']" (or similar) appears elided here
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        # NOTE(review): the "return [{ ... }]" wrapper around these entries appears elided
        'play_path': video_path,
        'title':video_title,
        'description': video_description,
        'thumbnail': video_thumb,
        'upload_date': video_date,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    # NOTE(review): excerpt appears elided — several guard/try lines and the
    # final "return [{" wrapper look missing; comments cover visible code only.
    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an "if mobj is None:" guard presumably preceded this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Normalize scheme-less and pro/direct-link URLs to a canonical form
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        # NOTE(review): a "try:" header appears elided before this split
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        # NOTE(review): the matching "except" header appears elided around here
        if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
            raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
        raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # NOTE(review): the "else:" header for this branch appears elided
                files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available file, preferring hd, then sd, then anything
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # NOTE(review): a "break" and an "else:" (for-else) appear elided before this raise
        raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # NOTE(review): the "return [{ ... }]" wrapper around these entries appears elided
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # NOTE(review): excerpt appears elided — "try:" headers, "if" guards,
    # "info = {}" initialisation and closing parentheses of several calls
    # look missing; comments cover visible code only.

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams are identified by their index-NN.html page name
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page bytes."""
        request = compat_urllib_request.Request(url)
        # NOTE(review): the "try:" header for the except clauses below appears elided
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and return a dict built from
        *matchTuples* entries (group index, key, error message)."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # NOTE(review): an "if mobj is None:" guard and "info = {}" appear elided
        raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            # NOTE(review): an "else:" header appears elided before this assignment
            info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        """Resolve the RTMP stream parameters for an arte.tv live page."""
        # Language code is encoded in the URL path
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'Invalid URL: %s' % url)

        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            (1, 'path', u'could not extract video path: %s' % url),
            (2, 'player', u'could not extract video player: %s' % url),
            (3, 'url', u'could not extract video url: %s' % url)

        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the arte+7 indirections down to the final video info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'Invalid URL: %s' % url)

        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'Could not find <video> tag: %s' % url)

        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'could not extract video id: %s' % url),
            (2, 'title', u'could not extract video title: %s' % url),
            (3, 'date', u'could not extract video date: %s' % url),
            (4, 'url', u'could not extract video url: %s' % url)

        # NOTE(review): the "return {" wrapper around these entries appears elided
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': unified_strdate(info.get('date')),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        # Dispatch between the live-stream and the arte+7 extraction paths
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
        # NOTE(review): an "else:" header (and a return) appears elided here
        info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # NOTE(review): excerpt appears elided — several guard/return lines and
    # "if mobj is None:" chains look missing; comments cover visible code only.

    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn only in normal runs; the test suite triggers this IE on purpose
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        # Issue a HEAD request so no body is downloaded while resolving redirects
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                # NOTE(review): the 'return "HEAD"' body appears elided

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request carries no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                        origin_req_host=req.get_origin_req_host(),
                # NOTE(review): closing args and an "else:" header appear elided here
                raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # NOTE(review): fp cleanup lines appear elided here
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                    origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener with exactly the handlers needed for the probe
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # NOTE(review): the "same URL → return False" shortcut appears elided here

        self.report_following_redirect(new_url)
        # NOTE(review): "return new_url" appears elided

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        # NOTE(review): the "try:" header for the except clause below appears elided
        webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # NOTE(review): the "if mobj is None:" fall-through chain between these
        # searches appears elided; each regex is a progressively broader probe.
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        # Try to find twitter cards info
        mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        # Video Title - Site Name
        # Site Name | Video Title
        # Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        # NOTE(review): the "return [{ ... }]" wrapper around these entries appears elided
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    # gdata v2 JSON-C API, paged 50 results at a time
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): initialisation lines (video_ids, pagenum, limit)
        # appear elided from this excerpt.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # start-index is 1-based in the gdata API
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # NOTE(review): a "try:" header appears elided before this read
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more than the API reports to exist
            limit = min(n, api_response['totalItems'])
            # NOTE(review): the "pagenum += 1" step appears elided here

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    # Presence of the "next" pagination link marks more result pages
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): the "res = {" wrapper (playlist dict with 'entries',
        # 'id', 'title') appears partially elided in this excerpt.
        '_type': 'playlist',

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                # NOTE(review): the "e = {" wrapper around this entry appears elided
                'url': mobj.group(1)
                res['entries'].append(e)

            # Stop once enough entries were collected or no further pages exist
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # NOTE(review): "return res" appears elided here
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): the "res = {" playlist-dict wrapper appears partially
        # elided in this excerpt.
        '_type': 'playlist',

        # Results come 30 per page from the JSON search endpoint
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            # NOTE(review): "m = info[u'm']" (pagination metadata) appears elided;
            # it is referenced below — TODO confirm against upstream source.
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) +i >= n:
                    # NOTE(review): a "break" appears elided here
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
                # NOTE(review): a "break" and "return res" appear elided here
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # NOTE(review): this verbose raw-string regex is not closed within the
    # visible lines — alternation/anchor lines and the closing quotes appear
    # elided from this excerpt.
    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    # gdata v2 JSON feed, paged via max-results/start-index
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'

    IE_NAME = u'youtube:playlist'

    # NOTE(review): a "@classmethod" decorator presumably preceded this def
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is written in verbose form
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): an "if mobj is None:" guard presumably preceded this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        # NOTE(review): "videos = []" init and the page loop header appear elided
        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
        page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

        # NOTE(review): the "try:" header for this except appears elided
        response = json.loads(page)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

        if 'feed' not in response:
            raise ExtractorError(u'Got a malformed response from YouTube API')
        playlist_title = response['feed']['title']['$t']
        if 'entry' not in response['feed']:
            # Number of videos is a multiple of self._MAX_RESULTS
            # NOTE(review): a "break" appears elided here

        # Keep (position, url) pairs so the playlist order can be restored
        videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                    for entry in response['feed']['entry']
                    if 'content' in entry ]

        if len(response['feed']['entry']) < self._MAX_RESULTS:
            # NOTE(review): a "break" appears elided here

        # Sort by playlist position, then keep only the urls
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First page is plain HTML; subsequent pages come from the ajax endpoint
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the list of unique video ids found in *page* (in order)."""
        # NOTE(review): "ids_in_page = []" init appears elided from this excerpt
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        # NOTE(review): "return ids_in_page" appears elided

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an "if mobj is None:" guard presumably preceded this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        # NOTE(review): "video_ids = []" and "pagenum = 1" init appear elided
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            # NOTE(review): the "while True:" loop header appears elided here
            pagenum = pagenum + 1

            url = self._MORE_PAGES_URL % (pagenum, channel_id)
            page = self._download_webpage(url, channel_id,
                                          u'Downloading page #%s' % pagenum)

            page = json.loads(page)

            ids_in_page = self.extract_videos_from_page(page['content_html'])
            video_ids.extend(ids_in_page)

            # The load-more widget disappears on the last page
            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                # NOTE(review): a "break" appears elided here

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # gdata caps each query at 50 results, hence the paging below
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an "if mobj is None:" guard presumably preceded this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # NOTE(review): loop init ("video_ids = []", "pagenum = 0",
        # "while True:") appears elided from this excerpt.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1

        gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
        page = self._download_webpage(gdata_url, username,
                                      u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

        # Extract video identifiers
        # NOTE(review): "ids_in_page = []" init appears elided here
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # NOTE(review): a "break" (and "pagenum += 1") appear elided here

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an "if mobj is None:" guard presumably preceded this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The numeric users_id (needed by the ajax endpoint) is scraped
        # from the mobile user page.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # NOTE(review): loop init ("video_ids = []", "pagenum = 1",
        # "while True:") appears elided from this excerpt.
        url = page_base + "&page=" + str(pagenum)
        page = self._download_webpage(url, username,
                                      u'Downloading video ids from page %d' % pagenum)

        # Extract video identifiers
        # NOTE(review): "ids_in_page = []" init appears elided here
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._PAGE_SIZE:
            # NOTE(review): a "break" (and "pagenum += 1") appear elided here

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # NOTE(review): the "try:" header for the except clause below appears elided
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's own error message
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            # NOTE(review): an "else:" header appears elided before this raise
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # NOTE(review): the "return [{ ... }]" wrapper around these entries appears elided
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in with credentials from --username/--password or .netrc, if any."""
        if self._downloader is None:
            # NOTE(review): a "return" appears elided here

        # NOTE(review): "useremail = None" / "password = None" init appears elided
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): a "try:" header appears elided before this lookup
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # NOTE(review): credential unpacking and an "else:" appear elided
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        if useremail is None:
            # NOTE(review): a "return" and the login_form construction appear elided

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # NOTE(review): a "try:" header appears elided before this read
        login_results = compat_urllib_request.urlopen(request).read()
        # A login form in the response means the login attempt failed
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
            # NOTE(review): a "return" appears elided here
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an "if mobj is None:" guard presumably preceded this raise
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The video params JSON is wedged between these two JS fragments
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        # NOTE(review): an "if not m:" guard presumably preceded this raise
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD
        video_url = video_data.get('hd_src')
        # NOTE(review): an "if not video_url:" guard appears elided here
        video_url = video_data['sd_src']
        # NOTE(review): a second "if not video_url:" guard appears elided here
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
        # NOTE(review): the remaining _html_search_regex args and the
        # "info = { ... }" / return wrapper appear elided below
        'title': video_title,
        'duration': video_duration,
        'thumbnail': thumbnail,
# NOTE(review): elided listing — `try:` headers and the openers of several
# dict literals (e.g. the direct-download info dict before line 1997 and the
# regular info dict before 2026) are missing from this view.
1945 class BlipTVIE(InfoExtractor):
1946 """Information extractor for blip.tv"""
1948 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
1949 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1950 IE_NAME = u'blip.tv'
1952 def report_direct_download(self, title):
1953 """Report information extraction."""
1954 self.to_screen(u'%s: Direct download detected' % title)
1956 def _real_extract(self, url):
1957 mobj = re.match(self._VALID_URL, url)
1959 raise ExtractorError(u'Invalid URL: %s' % url)
1961 # See https://github.com/rg3/youtube-dl/issues/857
# api.swf URLs carry the id in the fragment; rewrite to a /play/ URL first.
1962 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1963 if api_mobj is not None:
1964 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1965 urlp = compat_urllib_parse_urlparse(url)
1966 if urlp.path.startswith('/play/'):
# /play/ URLs redirect; the real file id lives in the redirect's fragment.
# Resolving it produces an /a/a-<id> URL which is handled recursively.
1967 request = compat_urllib_request.Request(url)
1968 response = compat_urllib_request.urlopen(request)
1969 redirecturl = response.geturl()
1970 rurlp = compat_urllib_parse_urlparse(redirecturl)
1971 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1972 url = 'http://blip.tv/a/a-' + file_id
1973 return self._real_extract(url)
# Ask the site for JSON metadata; the iTunes User-Agent is required for the
# API to answer (presumably — carried over from the original extractor).
1980 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1981 request = compat_urllib_request.Request(json_url)
1982 request.add_header('User-Agent', 'iTunes/10.6.1')
1983 self.report_extraction(mobj.group(1))
1986 urlh = compat_urllib_request.urlopen(request)
1987 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1988 basename = url.split('/')[-1]
1989 title,ext = os.path.splitext(basename)
1990 title = title.decode('UTF-8')
1991 ext = ext.replace('.', '')
1992 self.report_direct_download(title)
1997 'upload_date': None,
2002 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2003 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2004 if info is None: # Regular URL
2006 json_code_bytes = urlh.read()
2007 json_code = json_code_bytes.decode('utf-8')
2008 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2009 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2012 json_data = json.loads(json_code)
2013 if 'Post' in json_data:
2014 data = json_data['Post']
# blip.tv timestamps look like '11-28-12 03:15PM'; normalized to YYYYMMDD.
2018 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2019 video_url = data['media']['url']
2020 umobj = re.match(self._URL_EXT, video_url)
2022 raise ValueError('Can not determine filename extension')
2023 ext = umobj.group(1)
2026 'id': data['item_id'],
2028 'uploader': data['display_name'],
2029 'upload_date': upload_date,
2030 'title': data['title'],
2032 'format': data['media']['mimeType'],
2033 'thumbnail': data['thumbnailUrl'],
2034 'description': data['description'],
2035 'player_url': data['embedUrl'],
2036 'user_agent': 'iTunes/10.6.1',
2038 except (ValueError,KeyError) as err:
2039 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): elided listing — initializers (`x = 0`, `out = ''`, the
# `__md5` def around 2069, the GK key opener before 2080, the `params = {}`
# loop setup before 2116, and several `if mobj is None:` guards) are missing
# from this view. The RC4/MD5 deobfuscation below is order-critical; do not
# restructure without the full file.
2044 class MyVideoIE(InfoExtractor):
2045 """Information Extractor for myvideo.de."""
2047 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2048 IE_NAME = u'myvideo'
2050 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2051 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2052 # https://github.com/rg3/youtube-dl/pull/842
# Standard RC4: KSA over a 256-entry box, then PRGA XOR-ing the data.
2053 def __rc4crypt(self,data, key):
2055 box = list(range(256))
2056 for i in list(range(256)):
2057 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2058 box[i], box[x] = box[x], box[i]
2064 y = (y + box[x]) % 256
2065 box[x], box[y] = box[y], box[x]
2066 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
2070 return hashlib.md5(s).hexdigest().encode()
2072 def _real_extract(self,url):
2073 mobj = re.match(self._VALID_URL, url)
2075 raise ExtractorError(u'invalid URL: %s' % url)
2077 video_id = mobj.group(1)
# Doubly-base64-encoded site key used to derive the RC4 key below.
2080 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2081 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2082 b'TnpsbA0KTVRkbU1tSTRNdz09'
2086 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2087 webpage = self._download_webpage(webpage_url, video_id)
# Fast path: a plain <source src=...> tag means a direct FLV URL.
2089 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2090 if mobj is not None:
2091 self.report_extraction(video_id)
2092 video_url = mobj.group(1) + '.flv'
2094 video_title = self._html_search_regex('<title>([^<]+)</title>',
2097 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2103 'upload_date': None,
2104 'title': video_title,
# Slow path: parse the flashvars block into request params for the
# encrypted player XML.
2109 mobj = re.search('var flashvars={(.+?)}', webpage)
2111 raise ExtractorError(u'Unable to extract video')
2116 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2117 if not a == '_encxml':
2120 encxml = compat_urllib_parse.unquote(b)
2121 if not params.get('domain'):
2122 params['domain'] = 'www.myvideo.de'
2123 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2124 if 'flash_playertype=MTV' in xmldata_url:
2125 self._downloader.report_warning(u'avoiding MTV player')
2127 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2128 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# The response is 'something=<hex>'; keep the hex payload and decrypt it
# with RC4 keyed from the decoded GK constant plus the video id.
2132 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2133 enc_data_b = binascii.unhexlify(enc_data)
2135 base64.b64decode(base64.b64decode(GK)) +
2137 str(video_id).encode('utf-8')
2140 dec_data = self.__rc4crypt(enc_data_b, sk)
2143 self.report_extraction(video_id)
# RTMP case: connectionurl present in the decrypted data.
2146 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2148 video_url = compat_urllib_parse.unquote(mobj.group(1))
2149 if 'myvideo2flash' in video_url:
2150 self._downloader.report_warning(u'forcing RTMPT ...')
2151 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2154 # extract non rtmp videos
2155 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2157 raise ExtractorError(u'unable to extract url')
2158 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2160 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2161 video_file = compat_urllib_parse.unquote(video_file)
# f4m manifests get rewritten to an .m3u8 HLS playlist; otherwise build an
# RTMP play path of the form 'ext:path'.
2163 if not video_file.endswith('f4m'):
2164 ppath, prefix = video_file.split('.')
2165 video_playpath = '%s:%s' % (prefix, ppath)
2166 video_hls_playlist = ''
2169 video_hls_playlist = (
2170 video_filepath + video_file
2171 ).replace('.f4m', '.m3u8')
2173 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2174 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2176 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2182 'tc_url': video_url,
2184 'upload_date': None,
2185 'title': video_title,
2187 'play_path': video_playpath,
2188 'video_file': video_file,
2189 'video_hls_playlist': video_hls_playlist,
2190 'player_url': video_swfobj,
# NOTE(review): elided listing — the `_video_extensions`/`_video_dimensions`
# dict bodies, several `if ... is None:` guards, the `turls` accumulation
# loop, and the per-part info-dict opener before 2350 are missing from view.
2194 class ComedyCentralIE(InfoExtractor):
2195 """Information extractor for The Daily Show and Colbert Report """
2197 # urls can be abbreviations like :thedailyshow or :colbert
2198 # urls for episodes like:
2199 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2200 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2201 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex: matched with re.VERBOSE everywhere below (suitable() is
# overridden precisely because the base class does not pass that flag).
2202 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2203 |(https?://)?(www\.)?
2204 (?P<showname>thedailyshow|colbertnation)\.com/
2205 (full-episodes/(?P<episode>.*)|
2207 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2208 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates in ascending order; the last entry is picked as "best" below.
2211 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2213 _video_extensions = {
2221 _video_dimensions = {
2231 def suitable(cls, url):
2232 """Receives a URL and returns True if suitable for this IE."""
2233 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2235 def _print_formats(self, formats):
2236 print('Available formats:')
2238 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2241 def _real_extract(self, url):
2242 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2244 raise ExtractorError(u'Invalid URL: %s' % url)
# ':tds' / ':colbert' shortcuts expand to the full-episodes front page.
2246 if mobj.group('shortname'):
2247 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2248 url = u'http://www.thedailyshow.com/full-episodes/'
2250 url = u'http://www.colbertnation.com/full-episodes/'
2251 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2252 assert mobj is not None
2254 if mobj.group('clip'):
2255 if mobj.group('showname') == 'thedailyshow':
2256 epTitle = mobj.group('tdstitle')
2258 epTitle = mobj.group('cntitle')
2261 dlNewest = not mobj.group('episode')
2263 epTitle = mobj.group('showname')
2265 epTitle = mobj.group('episode')
2267 self.report_extraction(epTitle)
# The page may redirect (e.g. front page -> newest episode); re-match the
# final URL so the episode group is populated.
2268 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2270 url = htmlHandle.geturl()
2271 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2273 raise ExtractorError(u'Invalid redirected URL: ' + url)
2274 if mobj.group('episode') == '':
2275 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2276 epTitle = mobj.group('episode')
2278 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2280 if len(mMovieParams) == 0:
2281 # The Colbert Report embeds the information in a without
2282 # a URL prefix; so extract the alternate reference
2283 # and then add the URL prefix manually.
2285 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2286 if len(altMovieParams) == 0:
2287 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2289 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2291 uri = mMovieParams[0][1]
2292 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2293 indexXml = self._download_webpage(indexUrl, epTitle,
2294 u'Downloading show index',
2295 u'unable to download episode index')
# One <item> per episode part; each part is resolved and appended to the
# results list (opener elided from this view).
2299 idoc = xml.etree.ElementTree.fromstring(indexXml)
2300 itemEls = idoc.findall('.//item')
2301 for partNum,itemEl in enumerate(itemEls):
2302 mediaId = itemEl.findall('./guid')[0].text
2303 shortMediaId = mediaId.split(':')[-1]
2304 showId = mediaId.split(':')[-2].replace('.com', '')
2305 officialTitle = itemEl.findall('./title')[0].text
2306 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2308 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2309 compat_urllib_parse.urlencode({'uri': mediaId}))
2310 configXml = self._download_webpage(configUrl, epTitle,
2311 u'Downloading configuration for %s' % shortMediaId)
2313 cdoc = xml.etree.ElementTree.fromstring(configXml)
2315 for rendition in cdoc.findall('.//rendition'):
2316 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2320 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2323 if self._downloader.params.get('listformats', None):
2324 self._print_formats([i[0] for i in turls])
2327 # For now, just pick the highest bitrate
2328 format,rtmp_video_url = turls[-1]
2330 # Get the format arg from the arg stream
2331 req_format = self._downloader.params.get('format', None)
2333 # Select format if we can find one
2336 format, rtmp_video_url = f, v
# The RTMP URL cannot be downloaded directly; rewrite it onto the known
# HTTP mirror by keeping only the gsp.comedystor/... tail.
2339 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2341 raise ExtractorError(u'Cannot transform RTMP url')
2342 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2343 video_url = base + m.group('finalid')
2345 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2350 'upload_date': officialDate,
2355 'description': officialTitle,
2357 results.append(info)
# NOTE(review): elided listing — the `if mobj is None:` guard before 2371,
# the try: before 2401 and the returned info-dict opener are missing here.
2362 class EscapistIE(InfoExtractor):
2363 """Information extractor for The Escapist """
2365 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2366 IE_NAME = u'escapist'
2368 def _real_extract(self, url):
2369 mobj = re.match(self._VALID_URL, url)
2371 raise ExtractorError(u'Invalid URL: %s' % url)
2372 showName = mobj.group('showname')
2373 videoId = mobj.group('episode')
2375 self.report_extraction(videoId)
2376 webpage = self._download_webpage(url, videoId)
# Metadata comes from <meta> tags; description/thumbnail are optional.
2378 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2379 webpage, u'description', fatal=False)
2381 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2382 webpage, u'thumbnail', fatal=False)
2384 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2385 webpage, u'player url')
# The page title is 'Show : Episode'; keep only the episode part.
# (The u'player url' label here looks like a copy-paste of the regex above;
# it should presumably say 'title' — flagging, not changing, in a doc pass.)
2387 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2388 webpage, u'player url').split(' : ')[-1]
2390 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2391 configUrl = compat_urllib_parse.unquote(configUrl)
2393 configJSON = self._download_webpage(configUrl, videoId,
2394 u'Downloading configuration',
2395 u'unable to download configuration')
2397 # Technically, it's JavaScript, not JSON
2398 configJSON = configJSON.replace("'", '"')
2401 config = json.loads(configJSON)
2402 except (ValueError,) as err:
2403 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2405 playlist = config['playlist']
# The second playlist entry holds the actual media URL.
2406 videoUrl = playlist[1]['url']
2411 'uploader': showName,
2412 'upload_date': None,
2415 'thumbnail': imgUrl,
2416 'description': videoDesc,
2417 'player_url': playerUrl,
# NOTE(review): elided listing — the info-dict opener before 2442, the two
# `try:` headers before the urlopen calls, and the `except IndexError` for
# the metadata XML are missing from this view.
2422 class CollegeHumorIE(InfoExtractor):
2423 """Information extractor for collegehumor.com"""
2426 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2427 IE_NAME = u'collegehumor'
2429 def report_manifest(self, video_id):
2430 """Report information extraction."""
2431 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2433 def _real_extract(self, url):
2434 mobj = re.match(self._VALID_URL, url)
2436 raise ExtractorError(u'Invalid URL: %s' % url)
2437 video_id = mobj.group('videoid')
2442 'upload_date': None,
2445 self.report_extraction(video_id)
# Step 1: fetch the moogaloop metadata XML for title/description/manifest.
2446 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2448 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2449 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2450 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2452 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2454 videoNode = mdoc.findall('./video')[0]
2455 info['description'] = videoNode.findall('./description')[0].text
2456 info['title'] = videoNode.findall('./caption')[0].text
2457 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2458 manifest_url = videoNode.findall('./file')[0].text
2460 raise ExtractorError(u'Invalid metadata XML file')
# Step 2: fetch the f4m manifest (hdcore param required by Adobe HDS).
2462 manifest_url += '?hdcore=2.10.3'
2463 self.report_manifest(video_id)
2465 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2466 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2467 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2469 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# Elements are namespaced under the Adobe f4m 1.0 schema.
2471 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2472 node_id = media_node.attrib['url']
2473 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2474 except IndexError as err:
2475 raise ExtractorError(u'Invalid manifest file')
# Step 3: assemble the segment URL from the manifest host and ids.
2477 url_pr = compat_urllib_parse_urlparse(manifest_url)
2478 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): elided listing — the `if mobj is None:` guard and the
# returned info-dict opener are missing from this view.
2485 class XVideosIE(InfoExtractor):
2486 """Information extractor for xvideos.com"""
2488 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2489 IE_NAME = u'xvideos'
2491 def _real_extract(self, url):
2492 mobj = re.match(self._VALID_URL, url)
2494 raise ExtractorError(u'Invalid URL: %s' % url)
2495 video_id = mobj.group(1)
2497 webpage = self._download_webpage(url, video_id)
2499 self.report_extraction(video_id)
# The flash player receives the media URL percent-encoded in 'flv_url='.
2502 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2503 webpage, u'video URL'))
2506 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2509 # Extract video thumbnail
2510 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2511 webpage, u'thumbnail', fatal=False)
2517 'upload_date': None,
2518 'title': video_title,
2520 'thumbnail': video_thumbnail,
2521 'description': None,
2527 class SoundcloudIE(InfoExtractor):
2528 """Information extractor for soundcloud.com
2529 To access the media, the uid of the song and a stream token
2530 must be extracted from the page source and the script must make
2531 a request to media.soundcloud.com/crossdomain.xml. Then
2532 the media can be grabbed by requesting from an url composed
2533 of the stream token and uid
2536 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2537 IE_NAME = u'soundcloud'
2539 def report_resolve(self, video_id):
2540 """Report information extraction."""
2541 self.to_screen(u'%s: Resolving id' % video_id)
2543 def _real_extract(self, url):
2544 mobj = re.match(self._VALID_URL, url)
2546 raise ExtractorError(u'Invalid URL: %s' % url)
2548 # extract uploader (which is in the url)
2549 uploader = mobj.group(1)
2550 # extract simple title (uploader + slug of song title)
2551 slug_title = mobj.group(2)
2552 simple_title = uploader + u'-' + slug_title
2553 full_title = '%s/%s' % (uploader, slug_title)
2555 self.report_resolve(full_title)
# Resolve the vanity URL to track metadata via the public API.
# NOTE(review): the client_id is a hard-coded API key baked into this code.
2557 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2558 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2559 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2561 info = json.loads(info_json)
2562 video_id = info['id']
2563 self.report_extraction(full_title)
# Second request maps the track id to concrete stream URLs (mp3 128k).
2565 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2566 stream_json = self._download_webpage(streams_url, full_title,
2567 u'Downloading stream definitions',
2568 u'unable to download stream definitions')
2570 streams = json.loads(stream_json)
2571 mediaURL = streams['http_mp3_128_url']
2572 upload_date = unified_strdate(info['created_at'])
2577 'uploader': info['user']['username'],
2578 'upload_date': upload_date,
2579 'title': info['title'],
2581 'description': info['description'],
# NOTE(review): near-duplicate of SoundcloudIE above, extended to sets
# (playlists): one info dict is produced per track in the set.
2584 class SoundcloudSetIE(InfoExtractor):
2585 """Information extractor for soundcloud.com sets
2586 To access the media, the uid of the song and a stream token
2587 must be extracted from the page source and the script must make
2588 a request to media.soundcloud.com/crossdomain.xml. Then
2589 the media can be grabbed by requesting from an url composed
2590 of the stream token and uid
2593 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2594 IE_NAME = u'soundcloud:set'
2596 def report_resolve(self, video_id):
2597 """Report information extraction."""
2598 self.to_screen(u'%s: Resolving id' % video_id)
2600 def _real_extract(self, url):
2601 mobj = re.match(self._VALID_URL, url)
2603 raise ExtractorError(u'Invalid URL: %s' % url)
2605 # extract uploader (which is in the url)
2606 uploader = mobj.group(1)
2607 # extract simple title (uploader + slug of song title)
2608 slug_title = mobj.group(2)
2609 simple_title = uploader + u'-' + slug_title
2610 full_title = '%s/sets/%s' % (uploader, slug_title)
2612 self.report_resolve(full_title)
2614 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2615 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2616 info_json = self._download_webpage(resolv_url, full_title)
2619 info = json.loads(info_json)
# The resolver reports per-set errors in an 'errors' list; report each one.
2620 if 'errors' in info:
2621 for err in info['errors']:
2622 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2625 self.report_extraction(full_title)
# One streams lookup per track in the set.
2626 for track in info['tracks']:
2627 video_id = track['id']
2629 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2630 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2632 self.report_extraction(video_id)
2633 streams = json.loads(stream_json)
2634 mediaURL = streams['http_mp3_128_url']
2639 'uploader': track['user']['username'],
2640 'upload_date': unified_strdate(track['created_at']),
2641 'title': track['title'],
2643 'description': track['description'],
# NOTE(review): elided listing — guards and the returned info-dict opener
# are missing from this view.
2648 class InfoQIE(InfoExtractor):
2649 """Information extractor for infoq.com"""
2650 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2652 def _real_extract(self, url):
2653 mobj = re.match(self._VALID_URL, url)
2655 raise ExtractorError(u'Invalid URL: %s' % url)
2657 webpage = self._download_webpage(url, video_id=url)
2658 self.report_extraction(url)
# The real media id is base64-encoded in a JS variable on the page.
2661 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2663 raise ExtractorError(u'Unable to extract video url')
2664 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2665 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2668 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2671 # Extract description
2672 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2673 webpage, u'description', fatal=False)
# Derive id/extension from the media filename in the RTMP URL.
2675 video_filename = video_url.split('/')[-1]
2676 video_id, extension = video_filename.split('.')
2682 'upload_date': None,
2683 'title': video_title,
2684 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2686 'description': video_description,
# NOTE(review): marked _WORKING = False (broken IE). Elided listing — try:
# headers and loop openers are missing. The .decode() calls in
# _real_extract assume str-as-bytes semantics (Python 2 era) — presumably
# one reason this extractor is disabled; do not "fix" without the full file.
2691 class MixcloudIE(InfoExtractor):
2692 """Information extractor for www.mixcloud.com"""
2694 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2695 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2696 IE_NAME = u'mixcloud'
2698 def report_download_json(self, file_id):
2699 """Report JSON download."""
2700 self.to_screen(u'Downloading json')
2702 def get_urls(self, jsonData, fmt, bitrate='best'):
2703 """Get urls from 'audio_formats' section in json"""
2706 bitrate_list = jsonData[fmt]
2707 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2708 bitrate = max(bitrate_list) # select highest
2710 url_list = jsonData[fmt][bitrate]
# TypeError means the format entry is a flat list with no per-bitrate map.
2711 except TypeError: # we have no bitrate info.
2712 url_list = jsonData[fmt]
2715 def check_urls(self, url_list):
2716 """Returns 1st active url from list"""
# Probes each candidate URL; network errors skip to the next one.
2717 for url in url_list:
2719 compat_urllib_request.urlopen(url)
2721 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2726 def _print_formats(self, formats):
2727 print('Available formats:')
2728 for fmt in formats.keys():
2729 for b in formats[fmt]:
2731 ext = formats[fmt][b][0]
2732 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2733 except TypeError: # we have no bitrate info
2734 ext = formats[fmt][0]
2735 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2738 def _real_extract(self, url):
2739 mobj = re.match(self._VALID_URL, url)
2741 raise ExtractorError(u'Invalid URL: %s' % url)
2742 # extract uploader & filename from url
2743 uploader = mobj.group(1).decode('utf-8')
2744 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2746 # construct API request
2747 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2748 # retrieve .json file with links to files
2749 request = compat_urllib_request.Request(file_url)
2751 self.report_download_json(file_url)
2752 jsonData = compat_urllib_request.urlopen(request).read()
2753 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2754 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2757 json_data = json.loads(jsonData)
2758 player_url = json_data['player_swf_url']
2759 formats = dict(json_data['audio_formats'])
2761 req_format = self._downloader.params.get('format', None)
2764 if self._downloader.params.get('listformats', None):
2765 self._print_formats(formats)
# 'best' (or no preference): take the first format whose URL list yields a
# live URL; otherwise require the exact requested format.
2768 if req_format is None or req_format == 'best':
2769 for format_param in formats.keys():
2770 url_list = self.get_urls(formats, format_param)
2772 file_url = self.check_urls(url_list)
2773 if file_url is not None:
2776 if req_format not in formats:
2777 raise ExtractorError(u'Format is not available')
2779 url_list = self.get_urls(formats, req_format)
2780 file_url = self.check_urls(url_list)
2781 format_param = req_format
2784 'id': file_id.decode('utf-8'),
2785 'url': file_url.decode('utf-8'),
2786 'uploader': uploader.decode('utf-8'),
2787 'upload_date': None,
2788 'title': json_data['name'],
2789 'ext': file_url.split('.')[-1].decode('utf-8'),
2790 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2791 'thumbnail': json_data['thumbnail_url'],
2792 'description': json_data['description'],
2793 'player_url': player_url.decode('utf-8'),
# NOTE(review): elided listing — info-dict openers, try: headers, and the
# list-comprehension openers building info['list'] are missing from view.
# Three URL shapes are handled: a specific video, a course page (expanded
# into per-video references), and the root page (expanded into courses).
2796 class StanfordOpenClassroomIE(InfoExtractor):
2797 """Information extractor for Stanford's Open ClassRoom"""
2799 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2800 IE_NAME = u'stanfordoc'
2802 def _real_extract(self, url):
2803 mobj = re.match(self._VALID_URL, url)
2805 raise ExtractorError(u'Invalid URL: %s' % url)
2807 if mobj.group('course') and mobj.group('video'): # A specific video
2808 course = mobj.group('course')
2809 video = mobj.group('video')
2811 'id': course + '_' + video,
2813 'upload_date': None,
2816 self.report_extraction(info['id'])
# Per-video metadata lives in a sibling XML file next to the videos.
2817 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2818 xmlUrl = baseUrl + video + '.xml'
2820 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2821 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2822 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2823 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2825 info['title'] = mdoc.findall('./title')[0].text
2826 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2828 raise ExtractorError(u'Invalid metadata XML file')
2829 info['ext'] = info['url'].rpartition('.')[2]
2831 elif mobj.group('course'): # A course page
2832 course = mobj.group('course')
2837 'upload_date': None,
2840 coursepage = self._download_webpage(url, info['id'],
2841 note='Downloading course info page',
2842 errnote='Unable to download course info page')
2844 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2846 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2847 coursepage, u'description', fatal=False)
# Collect each VideoPage link once (orderedSet dedups, preserving order),
# then recurse via self.extract() on every reference.
2849 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2852 'type': 'reference',
2853 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2857 for entry in info['list']:
2858 assert entry['type'] == 'reference'
2859 results += self.extract(entry['url'])
2863 'id': 'Stanford OpenClassroom',
2866 'upload_date': None,
2869 self.report_download_webpage(info['id'])
2870 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2872 rootpage = compat_urllib_request.urlopen(rootURL).read()
2873 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2874 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2876 info['title'] = info['id']
# Same reference-expansion pattern as above, one level up (courses).
2878 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2881 'type': 'reference',
2882 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2887 for entry in info['list']:
2888 assert entry['type'] == 'reference'
2889 results += self.extract(entry['url'])
# NOTE(review): elided listing — `if ... is None:` guards, the try: before
# 2935, and the 'performer' extraction (used at 2944) are missing from view.
2892 class MTVIE(InfoExtractor):
2893 """Information extractor for MTV.com"""
2895 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2898 def _real_extract(self, url):
2899 mobj = re.match(self._VALID_URL, url)
2901 raise ExtractorError(u'Invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; default to http.
2902 if not mobj.group('proto'):
2903 url = 'http://' + url
2904 video_id = mobj.group('videoid')
2906 webpage = self._download_webpage(url, video_id)
# Song/artist/uri metadata is published via mtv_* meta tags.
2908 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2909 webpage, u'song name', fatal=False)
2911 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2914 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2915 webpage, u'mtvn_uri', fatal=False)
2917 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2918 webpage, u'content id', fatal=False)
2920 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2921 self.report_extraction(video_id)
2922 request = compat_urllib_request.Request(videogen_url)
2924 metadataXml = compat_urllib_request.urlopen(request).read()
2925 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2926 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2928 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2929 renditions = mdoc.findall('.//rendition')
2931 # For now, always pick the highest quality.
2932 rendition = renditions[-1]
# Format label is built as '<ext>-<width>x<height>_<bitrate>' from the
# rendition attributes; a missing attribute raises below.
2935 _,_,ext = rendition.attrib['type'].partition('/')
2936 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2937 video_url = rendition.find('./src').text
2939 raise ExtractorError('Invalid rendition field.')
2944 'uploader': performer,
2945 'upload_date': None,
2946 'title': video_title,
# NOTE(review): elided listing — the `def _gen_sid` header before 2958, the
# `mixed = []` initializer, the format-selection branches (3006-3019), and
# the per-segment info-dict opener before 3036 are missing from view. The
# file-id descrambling is order-critical; keep byte-identical.
2954 class YoukuIE(InfoExtractor):
2955 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp + two bounded random ints, concatenated.
2958 nowTime = int(time.time() * 1000)
2959 random1 = random.randint(1000,1998)
2960 random2 = random.randint(1000,9999)
2962 return "%d%d%d" %(nowTime,random1,random2)
2964 def _get_file_ID_mix_string(self, seed):
# Deterministic LCG-driven shuffle of the alphabet; the same seed always
# yields the same permutation (server and client must agree on it).
2966 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2968 for i in range(len(source)):
2969 seed = (seed * 211 + 30031 ) % 65536
2970 index = math.floor(seed / 65536 * len(source) )
2971 mixed.append(source[int(index)])
2972 source.remove(source[int(index)])
2973 #return ''.join(mixed)
2976 def _get_file_id(self, fileId, seed):
# The '*'-separated indices in fileId select characters from the shuffled
# alphabet to reconstruct the real file id.
2977 mixed = self._get_file_ID_mix_string(seed)
2978 ids = fileId.split('*')
2982 realId.append(mixed[int(ch)])
2983 return ''.join(realId)
2985 def _real_extract(self, url):
2986 mobj = re.match(self._VALID_URL, url)
2988 raise ExtractorError(u'Invalid URL: %s' % url)
2989 video_id = mobj.group('ID')
2991 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2993 jsondata = self._download_webpage(info_url, video_id)
2995 self.report_extraction(video_id)
2997 config = json.loads(jsondata)
2999 video_title = config['data'][0]['title']
3000 seed = config['data'][0]['seed']
3002 format = self._downloader.params.get('format', None)
3003 supported_format = list(config['data'][0]['streamfileids'].keys())
# Format negotiation: 'best' prefers hd2 when available; 'worst' picks the
# other end (branch bodies elided from this view).
3005 if format is None or format == 'best':
3006 if 'hd2' in supported_format:
3011 elif format == 'worst':
3019 fileid = config['data'][0]['streamfileids'][format]
3020 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3021 except (UnicodeDecodeError, ValueError, KeyError):
3022 raise ExtractorError(u'Unable to extract info section')
3025 sid = self._gen_sid()
3026 fileid = self._get_file_id(fileid, seed)
3028 #column 8,9 of fileid represent the segment number
3029 #fileid[7:9] should be changed
# One download URL per segment; the segment index is spliced into the file
# id as two uppercase hex digits, with a per-segment key 'k'.
3030 for index, key in enumerate(keys):
3032 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3033 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3036 'id': '%s_part%02d' % (video_id, index),
3037 'url': download_url,
3039 'upload_date': None,
3040 'title': video_title,
3043 files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""
    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'

    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the page source.
        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)

        # Convert timestring to a format suitable for filename
        upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
        upload_date = upload_date.strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The downloadable mp4 lives on Turner's CDN, keyed by the URL path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

            'id': shortened_video_id,
            # 'uploader_date': uploader_date,
            'description': description,
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?P<channelid>[^/]+)|
        (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
        (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))

    # Page size used when walking the paged archive API (see _real_extract).
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'
3213 def report_download_page(self, channel, offset):
3214 """Report attempt to download a single page of videos."""
3215 self.to_screen(u'%s: Downloading video information from %d to %d' %
3216 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts."""
        webpage = self._download_webpage(url, video_id,
                u'Downloading video info JSON',
                u'unable to download video info JSON')

        response = json.loads(webpage)
        # On error the API returns a dict with an 'error' key, not a list.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-ish; keep only YYYYMMDD digits.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'

        if mobj.group('channelid'):
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                    note=u'Downloading chapter information',
                    errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Find the archive entry matching the id scraped from the page.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                    note='Downloading chapter metadata',
                    errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                    u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

                'id': u'c' + chapter_id,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        limit = self._JUSTIN_PAGE_LIMIT
            self.report_download_page(video_id, offset)

            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we have reached the last one.
            if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Try the player heading first, then fall back to the page <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

            'description': video_description,
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com video pages."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    # Fixed birth date query bypasses the store's age gate.
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
3369 def suitable(cls, url):
3370 """Receives a URL and returns True if suitable for this IE."""
3371 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Retry through the age-gate URL when the store asks for a birth date.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        # Pair each movie entry with its title and thumbnail by page order.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # The flv is served from the CDN directly by video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        if 'mp4' in video_url:

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

            'title' : video_title,
            'thumbnail' : thumbnail,
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JS assignment (window.gon.show=...).
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    # Watch-page URLs carry a numeric id followed by a slugged title.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3524 def _print_formats(self, formats):
3525 """Print all available formats"""
3526 print(u'Available formats:')
3527 print(u'ext\t\tformat')
3528 print(u'---------------------------------')
3529 for format in formats:
3530 print(u'%s\t\t%s' % (format['ext'], format['format']))
    def _specific(self, req_format, formats):
        """Return the format dict whose 'format' field equals req_format."""
            if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Age-gate bypass: the site honours this cookie server-side.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
            params = json.loads(json_params)
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Resolution and bitrate are the first two '_'-separated tokens
            # of the 4th path component, e.g. '480p_370k_8004515'.
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'thumbnail': thumbnail,
                'description': video_description

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            format = self._specific( req_format, formats )
                raise ExtractorError(u'Requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is embedded in the player configuration.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        # The real video URL only appears on the embed page.
        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (returns all tracks)."""
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JS assignment (PAGE.mix = {...};).
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API requires a random session token.
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        # Walk the play API one track at a time until at_last_track.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Media and thumbnail URLs are derived directly from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>\w+) # Here goes the name and then ".html"
3784 def suitable(cls, url):
3785 """Receives a URL and returns True if suitable for this IE."""
3786 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        # Single talk vs. playlist is decided by the matched URL group.
        if m.group('type_talk'):
            return [self._talk_info(url)]
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
            <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
            ([.\s]*?)data-playlist_item_id="(\d+)"
            ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                webpage, 'playlist title')

        # Each talk becomes a url_result delegated back to this extractor.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                webpage, 'thumbnail')
            # The last htmlStreams entry is used as the download URL.
            'url': info['htmlStreams'][-1]['file'],
            'thumbnail': thumbnail,
            'description': desc,
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = format_id_el.text
        # description and thumbnail are optional in the metadata XML
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
            'thumbnail': thumbnail,
            'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',

        # Per-video XML manifest lists the available files.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> entry in the manifest is used.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
            'title': video_title,
            'duration': duration,
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site branding from the og:title value.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
class ARDIE(InfoExtractor):
    """Information extractor for ardmediathek.de / mediathek.daserste.de."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when present.
        numid = re.search(r'documentId=([0-9]+)', url)
            video_id = numid.group(1)
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
            # No streams usually means the page is age-restricted ("fsk").
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
class ZDFIE(InfoExtractor):
    """Information extractor for www.zdf.de (ZDFmediathek)."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        for s in streams: #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
        for s in streams: #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
            raise ExtractorError(u'No stream found.')

        # The chosen stream URL points to an ASX-style wrapper document.
        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Prefer mms://, fall back to rtsp://.
        mobj = re.search(self._MMS_STREAM, media_link)
            mobj = re.search(self._RTSP_STREAM, media_link)
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
# Tumblr video post extractor.
# NOTE(review): this excerpt appears to be missing lines (e.g. the
# `if video is None:` guard and the tail of the return dict) — verify
# against the complete file.
class TumblrIE(InfoExtractor):
_VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
def _real_extract(self, url):
m_url = re.match(self._VALID_URL, url)
video_id = m_url.group('id')
blog = m_url.group('blog_name')
# Normalize to the canonical post URL before downloading.
url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
webpage = self._download_webpage(url, video_id)
# The video URL is embedded with \x22-escaped quotes in inline JS.
re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
video = re.search(re_video, webpage)
raise ExtractorError(u'Unable to extract video')
video_url = video.group('video_url')
ext = video.group('ext')
video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
webpage, u'thumbnail', fatal=False) # We pick the first poster
if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
# The only place where you can get a title, it's not complete,
# but searching in other places doesn't work for all videos
video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
webpage, u'title', flags=re.DOTALL)
return [{'id': video_id,
'title': video_title,
'thumbnail': video_thumbnail,
# Bandcamp free-track extractor.
# NOTE(review): `id` shadows the builtin; renaming is avoided here because
# the value is interpolated into request URLs below. Some lines of the
# final track_info dict appear to be elided in this excerpt.
class BandcampIE(InfoExtractor):
_VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
title = mobj.group('title')
webpage = self._download_webpage(url, title)
# We get the link to the free download page
m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
if m_download is None:
raise ExtractorError(u'No free songs found')
download_link = m_download.group(1)
# Track id comes from the inline TralbumData JS object.
id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
webpage, re.MULTILINE|re.DOTALL).group('id')
download_webpage = self._download_webpage(download_link, id,
'Downloading free downloads page')
# We get the dictionary of the track from some javascrip code
info = re.search(r'items: (.*?),$',
download_webpage, re.MULTILINE).group(1)
info = json.loads(info)[0]
# We pick mp3-320 for now, until format selection can be easily implemented.
mp3_info = info[u'downloads'][u'mp3-320']
# If we try to use this url it says the link has expired
initial_url = mp3_info[u'url']
re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
m_url = re.match(re_url, initial_url)
#We build the url we will use to get the final track url
# This url is build in Bandcamp in the script download_bunde_*.js
request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
# If we could correctly generate the .rand field the url would be
#in the "download_url" key
final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
track_info = {'id':id,
'title' : info[u'title'],
'thumbnail' : info[u'thumb_url'],
'uploader' : info[u'artist']
class RedTubeIE(InfoExtractor):
"""Information Extractor for redtube"""
# NOTE(review): the `if mobj is None:` guard and the tail of the return
# dict appear to be elided in this excerpt.
_VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
def _real_extract(self,url):
mobj = re.match(self._VALID_URL, url)
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('id')
video_extension = 'mp4'
webpage = self._download_webpage(url, video_id)
self.report_extraction(video_id)
# Direct mp4 URL from the HTML5 <source> tag.
video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
webpage, u'video URL')
video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
'ext': video_extension,
'title': video_title,
class InaIE(InfoExtractor):
"""Information Extractor for Ina.fr"""
# NOTE(review): guard clauses and the tail of the return dict appear to be
# elided in this excerpt.
_VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
def _real_extract(self,url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
# Metadata (including the mp4 URL) lives in a per-video MRSS feed.
mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
video_extension = 'mp4'
webpage = self._download_webpage(mrss_url, video_id)
self.report_extraction(video_id)
video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
webpage, u'video URL')
# Title is wrapped in a CDATA section inside <title>.
video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
'ext': video_extension,
'title': video_title,
class HowcastIE(InfoExtractor):
"""Information Extractor for Howcast.com"""
# NOTE(review): some lines (guards, parts of the return dict) appear to be
# elided in this excerpt.
_VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
# Re-fetch via the canonical URL built from the id.
webpage_url = 'http://www.howcast.com/videos/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
self.report_extraction(video_id)
video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
webpage, u'video URL')
# og:title meta tag; accepts either quote style around the content attr.
video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
webpage, u'description', fatal=False)
thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
webpage, u'thumbnail', fatal=False)
'title': video_title,
'description': video_description,
'thumbnail': thumbnail,
class VineIE(InfoExtractor):
"""Information Extractor for Vine.co"""
# NOTE(review): some lines (guards, parts of the return dict) appear to be
# elided in this excerpt.
_VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
# Canonical https URL rebuilt from the id.
webpage_url = 'https://vine.co/v/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
self.report_extraction(video_id)
# Direct stream URL from the twitter:player:stream meta tag.
video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
webpage, u'video URL')
video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
webpage, u'thumbnail', fatal=False)
uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
webpage, u'uploader', fatal=False, flags=re.DOTALL)
'title': video_title,
'thumbnail': thumbnail,
'uploader': uploader,
class FlickrIE(InfoExtractor):
"""Information Extractor for Flickr videos"""
# NOTE(review): some lines (guards, parts of the return dict) appear to be
# elided in this excerpt.
_VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_uploader_id = mobj.group('uploader_id')
webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
# Per-photo secret required by the video XML endpoints below.
secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
# First XML gives a node id ...
first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
first_xml, u'node_id')
# ... which keys the playlist XML containing the actual stream location.
second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
self.report_extraction(video_id)
mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
raise ExtractorError(u'Unable to extract video url')
# Final URL is APP + unescaped FULLPATH.
video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
webpage, u'video title')
video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
webpage, u'description', fatal=False)
thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
webpage, u'thumbnail', fatal=False)
'title': video_title,
'description': video_description,
'thumbnail': thumbnail,
'uploader_id': video_uploader_id,
# Team Coco (teamcoco.com) extractor.
# NOTE(review): some lines (guards, parts of the return dict) appear to be
# elided in this excerpt.
class TeamcocoIE(InfoExtractor):
_VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
raise ExtractorError(u'Invalid URL: %s' % url)
url_title = mobj.group('url_title')
webpage = self._download_webpage(url, url_title)
# Numeric video id lives in the page markup, not the URL.
video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
webpage, u'video id')
self.report_extraction(video_id)
video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
webpage, u'thumbnail', fatal=False)
video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
webpage, u'description', fatal=False)
# Separate CVP XML document carries the actual file URLs.
data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
'title': video_title,
'thumbnail': thumbnail,
'description': video_description,
class XHamsterIE(InfoExtractor):
"""Information Extractor for xHamster"""
# NOTE(review): some lines (guards, else-branches, parts of the return
# dict) appear to be elided in this excerpt.
_VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
def _real_extract(self,url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
webpage = self._download_webpage(mrss_url, video_id)
# Server + file come from inline player config JS.
mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
raise ExtractorError(u'Unable to extract media URL')
if len(mobj.group('server')) == 0:
# Empty server means 'file' is already a full (percent-encoded) URL.
video_url = compat_urllib_parse.unquote(mobj.group('file'))
video_url = mobj.group('server')+'/key='+mobj.group('file')
video_extension = video_url.split('.')[-1]
video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
# Can't see the description anywhere in the UI
# video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
# webpage, u'description', fatal=False)
# if video_description: video_description = unescapeHTML(video_description)
# Upload date from a tooltip hint like "2012-01-31 12:00:00 CET".
mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
video_upload_date = None
self._downloader.report_warning(u'Unable to extract upload date')
video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
webpage, u'uploader id', default=u'anonymous')
video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
webpage, u'thumbnail', fatal=False)
'ext': video_extension,
'title': video_title,
# 'description': video_description,
'upload_date': video_upload_date,
'uploader_id': video_uploader_id,
'thumbnail': video_thumbnail
class HypemIE(InfoExtractor):
"""Information Extractor for hypem"""
# NOTE(review): some lines (guards, try/except bodies, `key` extraction and
# the final return) appear to be elided in this excerpt.
_VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
raise ExtractorError(u'Invalid URL: %s' % url)
track_id = mobj.group(1)
# ax/ts query params are required by the site; ts is the current time.
data = { 'ax': 1, 'ts': time.time() }
data_encoded = compat_urllib_parse.urlencode(data)
complete_url = url + "?" + data_encoded
request = compat_urllib_request.Request(complete_url)
response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# Session cookie must be replayed on the /serve/source request below.
cookie = urlh.headers.get('Set-Cookie', '')
self.report_extraction(track_id)
html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
track_list = json.loads(html_tracks)
track = track_list[u'tracks'][0]
raise ExtractorError(u'Hypemachine contained invalid JSON.')
track_id = track[u"id"]
artist = track[u"artist"]
title = track[u"song"]
# NOTE(review): `key` is referenced here but its assignment is not visible
# in this excerpt — presumably taken from the track dict; confirm.
serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
request.add_header('cookie', cookie)
song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
song_data = json.loads(song_data_json)
raise ExtractorError(u'Hypemachine contained invalid JSON.')
final_url = song_data[u"url"]
class Vbox7IE(InfoExtractor):
"""Information Extractor for Vbox7"""
# NOTE(review): some lines (guards, parts of the return dict) appear to be
# elided in this excerpt.
_VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
def _real_extract(self,url):
mobj = re.match(self._VALID_URL, url)
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group(1)
# The play page redirects via a JS `window.location` assignment.
redirect_page, urlh = self._download_webpage_handle(url, video_id)
new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
redirect_url = urlh.geturl() + new_location
webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
title = self._html_search_regex(r'<title>(.*)</title>',
webpage, u'title').split('/')[0].strip()
# POST to the info endpoint returns "key=value&key=value" pairs.
info_url = "http://vbox7.com/play/magare.do"
data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
info_request = compat_urllib_request.Request(info_url, data)
info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
if info_response is None:
raise ExtractorError(u'Unable to extract the media url')
# Split the ampersand-separated pairs and keep the values.
(final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
'thumbnail': thumbnail_url,
4496 class GametrailersIE(InfoExtractor):
4497 _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
4499 def _real_extract(self, url):
4500 mobj = re.match(self._VALID_URL, url)
4502 raise ExtractorError(u'Invalid URL: %s' % url)
4503 video_id = mobj.group('id')
4504 video_type = mobj.group('type')
4505 webpage = self._download_webpage(url, video_id)
4506 if video_type == 'full-episodes':
4507 mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
4509 mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
4510 mgid = self._search_regex(mgid_re, webpage, u'mgid')
4511 data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
4513 info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
4514 video_id, u'Downloading video info')
4515 links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
4516 video_id, u'Downloading video urls info')
4518 self.report_extraction(video_id)
4519 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
4520 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
4522 <url>(?P<thumb>.*?)</url>.*
4525 m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
4527 raise ExtractorError(u'Unable to extract video info')
4528 video_title = m_info.group('title')
4529 video_description = m_info.group('description')
4530 video_thumb = m_info.group('thumb')
4532 m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
4533 if m_urls is None or len(m_urls) == 0:
4534 raise ExtractError(u'Unable to extrat video url')
4535 # They are sorted from worst to best quality
4536 video_url = m_urls[-1].group('url')
4538 return {'url': video_url,
4540 'title': video_title,
4541 # Videos are actually flv not mp4
4543 'thumbnail': video_thumb,
4544 'description': video_description,
# Factory for the ordered list of all supported extractor instances.
# NOTE(review): the docstring's closing quotes and most of the returned
# list appear to be elided in this excerpt; no comments are inserted below
# to avoid altering the (apparently still-open) string literal.
def gen_extractors():
""" Return a list of an instance of every supported extractor.
The order does matter; the first extractor matched is the one handling the URL.
YoutubePlaylistIE(),
StanfordOpenClassroomIE(),
WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up an extractor class by its short name.

    `ie_name` is the class name without the trailing 'IE' suffix (e.g.
    'Youtube' -> YoutubeIE). Raises KeyError when no matching extractor
    class is defined in this module.
    """
    return globals()['%sIE' % ie_name]