2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): at least one original statement between the docstring
        # and this call is not visible in this excerpt.
        self.set_downloader(downloader)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Takes `cls`, so presumably decorated with @classmethod in the full
        # source -- the decorator line is not visible in this excerpt.
        return re.match(cls._VALID_URL, url) is not None

        # NOTE(review): the enclosing `def` lines for the two fragments below
        # are not visible in this excerpt; code kept as-is.
        """Getter method for _WORKING."""

        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): a line between the docstring and this return is not
        # visible in this excerpt.
        return self._real_extract(url)
101 def set_downloader(self, downloader):
102 """Sets the downloader for this IE."""
103 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # NOTE(review): fragment -- the enclosing definition for the return
        # below (apparently an IE name helper that strips the trailing "IE")
        # is not visible in this excerpt.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): the `if note is None:`, `try:` and `if errnote is
        # None:` headers appear to be elided from this excerpt; code kept
        # as-is.
        self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, keeping the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        # NOTE(review): several branch headers (`if m:`, `try:`, the
        # charset fallback) are elided from this excerpt; code kept as-is.
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the declared charset out of e.g. "text/html; charset=utf-8".
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request may be a plain string instead of a Request object.
            url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps the dump binary-safe on any terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
155 def to_screen(self, msg):
156 """Print msg to screen, prefixing it with '[ie_name]'"""
157 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
159 def report_extraction(self, id_or_name):
160 """Report information extraction."""
161 self.to_screen(u'%s: Extracting information' % id_or_name)
163 def report_download_webpage(self, video_id):
164 """Report webpage download."""
165 self.to_screen(u'%s: Downloading webpage' % video_id)
167 def report_age_confirmation(self):
168 """Report attempt to confirm age."""
169 self.to_screen(u'Confirming age')
    # Methods for following #608.
    # They set the correct value of the '_type' key.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        # (return statement not visible in this excerpt)

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # (remaining dict entries and return not visible in this excerpt)

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # (dict continuation not visible in this excerpt)
        # The id/title keys are only meaningful when the caller supplied them;
        # presumably each assignment is guarded by an elided `if ... is not None:`.
        video_info['id'] = playlist_id
        video_info['title'] = playlist_title
        # (return statement not visible in this excerpt)
    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        # NOTE(review): several branch/loop headers are elided from this
        # excerpt; code kept as-is.
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
            # list-of-patterns case (its loop header is elided):
            mobj = re.search(p, string, flags)

        # Colourise the field name on terminals that support ANSI escapes.
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name

            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
        raise ExtractorError(u'Unable to extract %s' % _name)
        self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on GitHub.' % _name)
    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        # (a None-guard between these two lines is not visible in this excerpt)
        return clean_html(res).strip()
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    def _make_valid_url(cls):
        # Takes `cls`: presumably @classmethod in the full source (the
        # decorator line is not visible in this excerpt).
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def suitable(cls, url):
        # Takes `cls`: presumably @classmethod in the full source.
        return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        """Dispatch a "<key>N:terms" / "<key>all:terms" search query."""
        # NOTE(review): several branch headers (`if mobj is None:`, the empty
        # prefix branch, int conversion of n) are elided from this excerpt.
        mobj = re.match(self._make_valid_url(), query)
        raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # Empty prefix -> a single result.
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
270 def _get_n_results(self, query, n):
271 """Get a specified number of results for a query"""
272 raise NotImplementedError("This method must be implemented by sublclasses")
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""
    # NOTE(review): the lines below are the body of a verbose (_VALID_URL)
    # regexp; its opening `_VALID_URL = r'''` line is not visible in this
    # excerpt.
        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
        (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
        (?:                                                  # the various things that can precede the ID:
            (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
            |(?:                                             # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
            )?                                               # optional -> youtube.com/xxxx is OK
        )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (most entries elided in this excerpt).
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display resolution (entries elided in this excerpt).
    _video_dimensions = {
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match the video regexp, so defer them to the
        # playlist IE first.  Takes `cls`: presumably @classmethod in the
        # full source (decorator line not visible here).
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
341 def report_lang(self):
342 """Report attempt to set language."""
343 self.to_screen(u'Setting language')
345 def report_login(self):
346 """Report attempt to log in."""
347 self.to_screen(u'Logging in')
349 def report_video_webpage_download(self, video_id):
350 """Report attempt to download video webpage."""
351 self.to_screen(u'%s: Downloading video webpage' % video_id)
353 def report_video_info_webpage_download(self, video_id):
354 """Report attempt to download video info webpage."""
355 self.to_screen(u'%s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        # (original docstring wrongly said "download video info webpage")
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download request for one subtitle language/format."""
        # (original docstring wrongly said "download video info webpage")
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
365 def report_video_subtitles_available(self, video_id, sub_lang_list):
366 """Report available subtitles."""
367 sub_lang = ",".join(list(sub_lang_list.keys()))
368 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
370 def report_information_extraction(self, video_id):
371 """Report attempt to extract video information."""
372 self.to_screen(u'%s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        # (original docstring wrongly said "Report extracted video URL")
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
378 def report_rtmp_download(self):
379 """Indicate the download will use the RTMP protocol."""
380 self.to_screen(u'RTMP download detected')
    def _get_available_subtitles(self, video_id):
        """Fetch the subtitle language list for video_id.

        On error returns a (message, None) tuple; on success it appears to
        return the {lang_code: lang_name} dict -- the final return line is
        not visible in this excerpt.
        """
        # NOTE(review): the `try:` header around the urlopen call is elided.
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # Map lang_code -> human-readable name.
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
395 def _list_available_subtitles(self, video_id):
396 sub_lang_list = self._get_available_subtitles(video_id)
397 self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Fetch one subtitle track; returns a tuple
        (error_message, sub_lang, sub)
        """
        # NOTE(review): the urlencode dict entries, `try:` header and the
        # guard before the "Did not fetch" return are elided in this excerpt.
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)
    def _request_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        # NOTE(review): guards, try/except headers and the urlencode dict
        # entries are elided from this excerpt; code kept as-is.
        sub_lang = self._downloader.params.get('subtitleslang')
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The watch page embeds the player config JSON, which carries the
        # caption endpoint ("ttsurl").
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
        return [(err_msg, None, None)]
        player_config = json.loads(mobj.group(1))
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        params = compat_urllib_parse.urlencode({
        subtitles_url = caption_url + '&' + params
        sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
        return [(None, sub_lang, sub)]
        return [(err_msg, None, None)]
    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        # NOTE(review): some branch bodies and the final return are elided
        # from this excerpt; code kept as-is.
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Pick the language: user choice, then English, then the first listed.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
        sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track.

        Appears to return a list of (error, lang, data) tuples; the list
        initialisation and final return are not visible in this excerpt.
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
    def _print_formats(self, formats):
        """Print each format with its container extension and resolution."""
        print('Available formats:')
        # (the loop header, presumably `for x in formats:`, is not visible
        # in this excerpt)
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set the interface language, log in, and confirm age.

        NOTE(review): a large number of original lines (try/except headers,
        guards, form-dict entries) are elided from this excerpt; code kept
        as-is.
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Set language (best effort: only a warning on failure)
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed

        # Fetch the login page to scrape the GALX / dsh hidden form fields.
        request = compat_urllib_request.Request(self._LOGIN_URL)
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # Login form fields (many entries elided in this excerpt).
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present, authentication failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age (form dict opening elided in this excerpt).
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Extract the video id from a YouTube URL."""
        # NOTE(review): the `if mobj is None:` guard and the final return of
        # video_id are not visible in this excerpt.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Extract video metadata and download URLs for a YouTube video.

        NOTE(review): many original lines (if/try/else headers, assignments,
        the results-list construction) are elided from this excerpt; code
        kept as-is.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Download the watch page.
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Un-escape the backslash-escaped URL.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several 'el' variants of get_video_info until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (warning branch header elided)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        video_uploader_id = mobj.group(1)
        self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # Normalise separators before parsing the date.
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

        # description (fallback chain: page element, meta tag, empty)
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        video_description = unescapeHTML(fd_mobj.group(1))
        video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            (sub_error, sub_lang, sub) = video_subtitles[0]
            # We try with the automatic captions
            video_subtitles = self._request_automatic_caption(video_id, video_webpage)
            (sub_error_auto, sub_lang, sub) = video_subtitles[0]
            # We report the original error
            self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> signed URL map from the stream map.
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
                    if not 'ratebypass' in url: url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                    self._video_dimensions.get(format_param, '???'))

            # Result dict entries (the dict opening and append are elided).
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')
    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter form.

        NOTE(review): the try headers and the filter form dict opening are
        elided from this excerpt; code kept as-is.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age (form dict entries partly elided).
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Extract the media URL, uploader and title for a Metacafe video.

        NOTE(review): guard headers (`if mobj is None:` etc.) and the result
        list construction are elided from this excerpt.  The .decode('utf-8')
        calls below are Python 2 idioms (str has no .decode on Python 3) --
        presumably this file targets the 2/3 compat layer; verify before
        changing.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback path: parse the flashvars blob.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        raise ExtractorError(u'Unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            raise ExtractorError(u'Unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
        raise ExtractorError(u'Unable to extract media URL')
        mediaURL = mobj.group('mediaURL').replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result dict entries (dict opening and return elided).
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract the best-quality video URL, title and uploader.

        NOTE(review): guard headers and several statements (max_quality
        selection, result dict opening/return) are elided from this excerpt;
        code kept as-is.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality key present in the flashvars blob.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self.to_screen(u'Using %s' % key)
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                             # Looking for official user
                                             r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
                                            webpage, 'video uploader')

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # DD-MM-YYYY -> YYYYMMDD
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # Result dict entries (dict opening and return elided).
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract a Photobucket video: JSON blob first, webpage fallback.

        NOTE(review): guard headers and the result dict openings/returns are
        elided from this excerpt.  The .decode('utf-8') calls below are
        Python 2 idioms (str has no .decode on Python 3) -- verify against
        the compat layer before changing.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        info = json.loads(mobj.group('json'))
        # Result dict entries for the JSON path (dict opening/return elided).
            'url': info[u'downloadUrl'],
            'uploader': info[u'username'],
            'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
            'title': info[u'title'],
            'ext': video_extension,
            'thumbnail': info[u'thumbUrl'],

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_uploader = mobj.group(2).decode('utf-8')

        # Result dict entries for the fallback path (opening/return elided).
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
1021 class YahooIE(InfoExtractor):
1022 """Information extractor for screen.yahoo.com."""
1023 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1025 def _real_extract(self, url):
1026 mobj = re.match(self._VALID_URL, url)
# NOTE(review): numbering gap here (1026->1028) — the `if mobj is None:`
# guard for the raise below is not visible in this excerpt.
1028 raise ExtractorError(u'Invalid URL: %s' % url)
1029 video_id = mobj.group('id')
1030 webpage = self._download_webpage(url, video_id)
# Two extraction strategies: if no YUI CONTENT_ID is found we query the
# bcst.yahoo.com REST/mrss endpoints with the URL id; otherwise (else
# branch below) we use the YQL JSON API with the long id.
1031 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1034 # TODO: Check which url parameters are required
1035 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1036 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
# Verbose-mode regex pulling title/description/date/thumbnail out of the
# mrss XML response.
1037 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1038 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1039 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1040 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1042 self.report_extraction(video_id)
1043 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1045 raise ExtractorError(u'Unable to extract video info')
1046 video_title = m_info.group('title')
1047 video_description = m_info.group('description')
1048 video_thumb = m_info.group('thumb')
1049 video_date = m_info.group('date')
# Normalize MM/DD/YYYY into the YYYYMMDD form expected for upload_date.
1050 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1052 # TODO: Find a way to get mp4 videos
1053 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1054 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1055 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1056 video_url = m_rest.group('url')
1057 video_path = m_rest.group('path')
1059 raise ExtractorError(u'Unable to extract video url')
1061 else: # We have to use a different method if another id is defined
1062 long_id = m_id.group('new_id')
# YQL public API: SELECT the media streams for this id; the response is a
# JSONP callback whose payload is parsed below.
1063 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1064 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
1065 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1066 info = json.loads(json_str)
1067 res = info[u'query'][u'results'][u'mediaObj'][0]
# First stream is used; `host` + `path` correspond to an rtmp-style
# url/play_path pair consumed by the downloader.
1068 stream = res[u'streams'][0]
1069 video_path = stream[u'path']
1070 video_url = stream[u'host']
1072 video_title = meta[u'title']
1073 video_description = meta[u'description']
1074 video_thumb = meta[u'thumbnail']
1075 video_date = None # I can't find it
1080 'play_path': video_path,
1081 'title':video_title,
1082 'description': video_description,
1083 'thumbnail': video_thumb,
1084 'upload_date': video_date,
1089 class VimeoIE(InfoExtractor):
1090 """Information extractor for vimeo.com."""
1092 # _VALID_URL matches Vimeo URLs
1093 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1096 def _real_extract(self, url, new_video=True):
1097 # Extract ID from URL
1098 mobj = re.match(self._VALID_URL, url)
# NOTE(review): numbering gaps in this excerpt (1098->1100, 1120 vs 1122)
# indicate missing guard/try lines around the raises below.
1100 raise ExtractorError(u'Invalid URL: %s' % url)
1102 video_id = mobj.group('id')
# Canonicalize the URL: force https, and for direct/pro links rewrite to
# the plain vimeo.com/<id> page where the config JSON lives.
1103 if not mobj.group('proto'):
1104 url = 'https://' + url
1105 if mobj.group('direct_link') or mobj.group('pro'):
1106 url = 'https://vimeo.com/' + video_id
1108 # Retrieve video webpage to extract further information
1109 request = compat_urllib_request.Request(url, None, std_headers)
1110 webpage = self._download_webpage(request, video_id)
1112 # Now we begin extracting as much information as we can from what we
1113 # retrieved. First we extract the information common to all extractors,
1114 # and later we extract those that are Vimeo specific.
1115 self.report_extraction(video_id)
1117 # Extract the config JSON
# Fragile string-slicing of the page's inline JS config object; breaks if
# Vimeo changes the surrounding markup.
1119 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1120 config = json.loads(config)
1122 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1123 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1125 raise ExtractorError(u'Unable to extract info section')
1128 video_title = config["video"]["title"]
1130 # Extract uploader and uploader_id
1131 video_uploader = config["video"]["owner"]["name"]
1132 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1134 # Extract video thumbnail
1135 video_thumbnail = config["video"]["thumbnail"]
1137 # Extract video description
1138 video_description = get_element_by_attribute("itemprop", "description", webpage)
1139 if video_description: video_description = clean_html(video_description)
1140 else: video_description = u''
1142 # Extract upload date
1143 video_upload_date = None
1144 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1145 if mobj is not None:
1146 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1148 # Vimeo specific: extract request signature and timestamp
# sig/timestamp are required query parameters of the play_redirect URL
# built at the end of this method.
1149 sig = config['request']['signature']
1150 timestamp = config['request']['timestamp']
1152 # Vimeo specific: extract video codec and quality information
1153 # First consider quality, then codecs, then take everything
1154 # TODO bind to format param
1155 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1156 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec into hd/sd/other, preserving the codec
# preference order defined above.
1157 for codec_name, codec_extension in codecs:
1158 if codec_name in config["video"]["files"]:
1159 if 'hd' in config["video"]["files"][codec_name]:
1160 files['hd'].append((codec_name, codec_extension, 'hd'))
1161 elif 'sd' in config["video"]["files"][codec_name]:
1162 files['sd'].append((codec_name, codec_extension, 'sd'))
1164 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first non-empty quality bucket (hd preferred, then sd).
1166 for quality in ('hd', 'sd', 'other'):
1167 if len(files[quality]) > 0:
1168 video_quality = files[quality][0][2]
1169 video_codec = files[quality][0][0]
1170 video_extension = files[quality][0][1]
1171 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1174 raise ExtractorError(u'No known codec found')
1176 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1177 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1182 'uploader': video_uploader,
1183 'uploader_id': video_uploader_id,
1184 'upload_date': video_upload_date,
1185 'title': video_title,
1186 'ext': video_extension,
1187 'thumbnail': video_thumbnail,
1188 'description': video_description,
1192 class ArteTvIE(InfoExtractor):
1193 """arte.tv information extractor."""
1195 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1196 _LIVE_URL = r'index-[0-9]+\.html$'
1198 IE_NAME = u'arte.tv'
1200 def fetch_webpage(self, url):
# Download a page's raw bytes, wrapping network errors in ExtractorError.
# NOTE(review): the `try:` header for the except clauses below is not
# visible in this excerpt (numbering gap 1201->1203).
1201 request = compat_urllib_request.Request(url)
1203 self.report_download_webpage(url)
1204 webpage = compat_urllib_request.urlopen(request).read()
1205 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1206 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1207 except ValueError as err:
1208 raise ExtractorError(u'Invalid URL: %s' % url)
1211 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch `url`, apply `regex`, and return a dict built from matchTuples,
# each (group_index, key, error_message); raises if a group is missing.
1212 page = self.fetch_webpage(url)
1213 mobj = re.search(regex, page, regexFlags)
1217 raise ExtractorError(u'Invalid URL: %s' % url)
1219 for (i, key, err) in matchTuples:
1220 if mobj.group(i) is None:
1221 raise ExtractorError(err)
1223 info[key] = mobj.group(i)
1227 def extractLiveStream(self, url):
# Live-stream path: locate the videothek JS, then grep the geo-specific
# rtmp path/player/url triple out of it. Language comes from the URL.
1228 video_lang = url.split('/')[-4]
1229 info = self.grep_webpage(
1231 r'src="(.*?/videothek_js.*?\.js)',
1234 (1, 'url', u'Invalid URL: %s' % url)
1237 http_host = url.split('/')[2]
1238 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1239 info = self.grep_webpage(
1241 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1242 '(http://.*?\.swf).*?' +
1246 (1, 'path', u'could not extract video path: %s' % url),
1247 (2, 'player', u'could not extract video player: %s' % url),
1248 (3, 'url', u'could not extract video url: %s' % url)
# NOTE(review): video_url is built but this excerpt shows no return — the
# surrounding lines are missing from the view.
1251 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1253 def extractPlus7Stream(self, url):
# Plus7 (catch-up) path: follow videorefFileUrl -> per-language <video>
# ref -> final XML carrying id/name/date and the hd quality URL.
1254 video_lang = url.split('/')[-3]
1255 info = self.grep_webpage(
1257 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1260 (1, 'url', u'Invalid URL: %s' % url)
1263 next_url = compat_urllib_parse.unquote(info.get('url'))
1264 info = self.grep_webpage(
1266 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1269 (1, 'url', u'Could not find <video> tag: %s' % url)
1272 next_url = compat_urllib_parse.unquote(info.get('url'))
1274 info = self.grep_webpage(
1276 r'<video id="(.*?)".*?>.*?' +
1277 '<name>(.*?)</name>.*?' +
1278 '<dateVideo>(.*?)</dateVideo>.*?' +
1279 '<url quality="hd">(.*?)</url>',
1282 (1, 'id', u'could not extract video id: %s' % url),
1283 (2, 'title', u'could not extract video title: %s' % url),
1284 (3, 'date', u'could not extract video date: %s' % url),
1285 (4, 'url', u'could not extract video url: %s' % url)
1290 'id': info.get('id'),
1291 'url': compat_urllib_parse.unquote(info.get('url')),
1292 'uploader': u'arte.tv',
1293 'upload_date': unified_strdate(info.get('date')),
1294 'title': info.get('title').decode('utf-8'),
1300 def _real_extract(self, url):
1301 video_id = url.split('/')[-1]
1302 self.report_extraction(video_id)
# Dispatch: live pages (index-N.html) vs regular Plus7 video pages.
1304 if re.search(self._LIVE_URL, video_id) is not None:
1305 self.extractLiveStream(url)
1308 info = self.extractPlus7Stream(url)
1313 class GenericIE(InfoExtractor):
1314 """Generic last-resort information extractor."""
1317 IE_NAME = u'generic'
1319 def report_download_webpage(self, video_id):
1320 """Report webpage download."""
# Warn the user that no specialized extractor matched (skipped in tests).
1321 if not self._downloader.params.get('test', False):
1322 self._downloader.report_warning(u'Falling back on generic information extractor.')
1323 super(GenericIE, self).report_download_webpage(video_id)
1325 def report_following_redirect(self, new_url):
1326 """Report information extraction."""
1327 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1329 def _test_redirect(self, url):
1330 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Issue a HEAD request (falling back to GET on 405) and follow redirects
# manually so shortener targets can be re-dispatched to the right IE.
1331 class HeadRequest(compat_urllib_request.Request):
1332 def get_method(self):
# NOTE(review): the method body (presumably `return "HEAD"`) is not
# visible in this excerpt — numbering gap 1332->1335.
1335 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1337 Subclass the HTTPRedirectHandler to make it use our
1338 HeadRequest also on the redirected URL
1340 def redirect_request(self, req, fp, code, msg, headers, newurl):
1341 if code in (301, 302, 303, 307):
1342 newurl = newurl.replace(' ', '%20')
# Strip body-related headers before re-issuing the redirected request.
1343 newheaders = dict((k,v) for k,v in req.headers.items()
1344 if k.lower() not in ("content-length", "content-type"))
1345 return HeadRequest(newurl,
1347 origin_req_host=req.get_origin_req_host(),
1350 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1352 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1354 Fallback to GET if HEAD is not allowed (405 HTTP error)
1356 def http_error_405(self, req, fp, code, msg, headers):
1360 newheaders = dict((k,v) for k,v in req.headers.items()
1361 if k.lower() not in ("content-length", "content-type"))
1362 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1364 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with the custom handlers; handler order matters
# for urllib's dispatch.
1368 opener = compat_urllib_request.OpenerDirector()
1369 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1370 HTTPMethodFallback, HEADRedirectHandler,
1371 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1372 opener.add_handler(handler())
1374 response = opener.open(HeadRequest(url))
1375 if response is None:
1376 raise ExtractorError(u'Invalid URL protocol')
1377 new_url = response.geturl()
1382 self.report_following_redirect(new_url)
1385 def _real_extract(self, url):
1386 new_url = self._test_redirect(url)
1387 if new_url: return [self.url_result(new_url)]
1389 video_id = url.split('/')[-1]
1391 webpage = self._download_webpage(url, video_id)
1392 except ValueError as err:
1393 # since this is the last-resort InfoExtractor, if
1394 # this error is thrown, it'll be thrown here
1395 raise ExtractorError(u'Invalid URL: %s' % url)
1397 self.report_extraction(video_id)
1398 # Start with something easy: JW Player in SWFObject
# Progressively broader heuristics; each later search presumably runs only
# when the previous one found nothing (guard lines not visible here).
1399 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1401 # Broaden the search a little bit
1402 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1404 # Broaden the search a little bit: JWPlayer JS loader
1405 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1407 # Try to find twitter cards info
1408 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1410 raise ExtractorError(u'Invalid URL: %s' % url)
1412 # It's possible that one of the regexes
1413 # matched, but returned an empty group:
1414 if mobj.group(1) is None:
1415 raise ExtractorError(u'Invalid URL: %s' % url)
1417 video_url = compat_urllib_parse.unquote(mobj.group(1))
1418 video_id = os.path.basename(video_url)
1420 # here's a fun little line of code for you:
1421 video_extension = os.path.splitext(video_id)[1][1:]
1422 video_id = os.path.splitext(video_id)[0]
1424 # it's tempting to parse this further, but you would
1425 # have to take into account all the variations like
1426 #   Video Title - Site Name
1427 #   Site Name | Video Title
1428 #   Video Title - Tagline | Site Name
1429 # and so on and so forth; it's just not practical
1430 video_title = self._html_search_regex(r'<title>(.*)</title>',
1431 webpage, u'video title')
1433 # video uploader is domain name
1434 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1435 url, u'video uploader')
1440 'uploader': video_uploader,
1441 'upload_date': None,
1442 'title': video_title,
1443 'ext': video_extension,
1447 class YoutubeSearchIE(SearchInfoExtractor):
1448 """Information Extractor for YouTube search queries."""
# Uses the (long-deprecated) GData v2 JSON-C search API, 50 results/page.
1449 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1451 IE_NAME = u'youtube:search'
1452 _SEARCH_KEY = 'ytsearch'
1454 def report_download_page(self, query, pagenum):
1455 """Report attempt to download search page with given number."""
1456 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1458 def _get_n_results(self, query, n):
1459 """Get a specified number of results for a query"""
# NOTE(review): initializers for video_ids / pagenum / limit are not
# visible in this excerpt (numbering gap 1459->1465).
1465 while (50 * pagenum) < limit:
1466 self.report_download_page(query, pagenum+1)
# GData start-index is 1-based, hence the +1.
1467 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1468 request = compat_urllib_request.Request(result_url)
1470 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1471 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1472 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1473 api_response = json.loads(data)['data']
1475 if not 'items' in api_response:
1476 raise ExtractorError(u'[youtube] No video results')
1478 new_ids = list(video['id'] for video in api_response['items'])
1479 video_ids += new_ids
# Cap the page loop by whichever is smaller: requested n or total hits.
1481 limit = min(n, api_response['totalItems'])
1484 if len(video_ids) > n:
1485 video_ids = video_ids[:n]
1486 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1487 return self.playlist_result(videos, query)
1490 class GoogleSearchIE(SearchInfoExtractor):
1491 """Information Extractor for Google Video search queries."""
# Presence of the "next" pagination link signals more result pages.
1492 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1494 IE_NAME = u'video.google:search'
1495 _SEARCH_KEY = 'gvsearch'
1497 def _get_n_results(self, query, n):
1498 """Get a specified number of results for a query"""
# NOTE(review): the `res = {` opener and some playlist fields are not
# visible in this excerpt (numbering gaps around 1501/1514).
1501 '_type': 'playlist',
1506 for pagenum in itertools.count(1):
1507 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1508 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1509 note='Downloading result page ' + str(pagenum))
# Each result is an <h3 class="r"> link; collect them as url entries.
1511 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1514 'url': mobj.group(1)
1516 res['entries'].append(e)
# Stop once n results are covered or no further pages exist.
1518 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1521 class YahooSearchIE(SearchInfoExtractor):
1522 """Information Extractor for Yahoo! Video search queries."""
1525 IE_NAME = u'screen.yahoo:search'
1526 _SEARCH_KEY = 'yvsearch'
1528 def _get_n_results(self, query, n):
1529 """Get a specified number of results for a query"""
# NOTE(review): the `res = {` opener and the assignment of `m` (used in
# the termination test below) are not visible in this excerpt.
1532 '_type': 'playlist',
1536 for pagenum in itertools.count(0):
# 30 results per page; `b` is the 1-based offset of the first result.
1537 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1538 webpage = self._download_webpage(result_url, query,
1539 note='Downloading results page '+str(pagenum+1))
# The endpoint returns JSON (o=js) rather than HTML.
1540 info = json.loads(webpage)
1542 results = info[u'results']
1544 for (i, r) in enumerate(results):
1545 if (pagenum * 30) +i >= n:
1547 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1548 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1549 res['entries'].append(e)
# Stop when n results are collected or the API reports the last page.
1550 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1556 class YoutubePlaylistIE(InfoExtractor):
1557 """Information Extractor for YouTube playlists."""
# Verbose regex (compiled with re.VERBOSE in suitable/_real_extract);
# matches playlist, course, artist and UU/EC/PL-prefixed list ids.
1559 _VALID_URL = r"""(?:
1564 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1565 \? (?:.*?&)*? (?:p|a|list)=
1568 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1571 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1573 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1575 IE_NAME = u'youtube:playlist'
1578 def suitable(cls, url):
1579 """Receives a URL and returns True if suitable for this IE."""
# Overridden (vs the base class) to pass re.VERBOSE for the regex above.
1580 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1582 def _real_extract(self, url):
1583 # Extract playlist id
1584 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1586 raise ExtractorError(u'Invalid URL: %s' % url)
1588 # Download playlist videos from API
1589 playlist_id = mobj.group(1) or mobj.group(2)
# Page through the GData playlist feed; start-index is 1-based.
1594 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1595 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1598 response = json.loads(page)
1599 except ValueError as err:
1600 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1602 if 'feed' not in response:
1603 raise ExtractorError(u'Got a malformed response from YouTube API')
1604 playlist_title = response['feed']['title']['$t']
1605 if 'entry' not in response['feed']:
1606 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, url) pairs; entries without 'content' (e.g.
# deleted/private videos) are skipped.
1609 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1610 for entry in response['feed']['entry']
1611 if 'content' in entry ]
1613 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Restore playlist order by position, then drop the position field.
1617 videos = [v[1] for v in sorted(videos)]
1619 url_results = [self.url_result(url, 'Youtube') for url in videos]
1620 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1623 class YoutubeChannelIE(InfoExtractor):
1624 """Information Extractor for YouTube channels."""
1626 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1627 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Marker string whose presence means a "load more" button / further pages.
1628 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1629 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1630 IE_NAME = u'youtube:channel'
1632 def extract_videos_from_page(self, page):
# Scrape unique /watch?v= ids from a channel page's HTML, in page order.
# NOTE(review): the `ids_in_page = []` initializer and the return line are
# not visible in this excerpt.
1634 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1635 if mobj.group(1) not in ids_in_page:
1636 ids_in_page.append(mobj.group(1))
1639 def _real_extract(self, url):
1640 # Extract channel id
1641 mobj = re.match(self._VALID_URL, url)
1643 raise ExtractorError(u'Invalid URL: %s' % url)
1645 # Download channel page
1646 channel_id = mobj.group(1)
1650 url = self._TEMPLATE_URL % (channel_id, pagenum)
1651 page = self._download_webpage(url, channel_id,
1652 u'Downloading page #%s' % pagenum)
1654 # Extract video identifiers
1655 ids_in_page = self.extract_videos_from_page(page)
1656 video_ids.extend(ids_in_page)
1658 # Download any subsequent channel pages using the json-based channel_ajax query
1659 if self._MORE_PAGES_INDICATOR in page:
1661 pagenum = pagenum + 1
1663 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1664 page = self._download_webpage(url, channel_id,
1665 u'Downloading page #%s' % pagenum)
# The ajax endpoint returns JSON whose 'content_html' holds the markup
# and 'load_more_widget_html' signals whether more pages remain.
1667 page = json.loads(page)
1669 ids_in_page = self.extract_videos_from_page(page['content_html'])
1670 video_ids.extend(ids_in_page)
1672 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1675 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1677 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1678 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1679 return [self.playlist_result(url_entries, channel_id)]
1682 class YoutubeUserIE(InfoExtractor):
1683 """Information Extractor for YouTube users."""
1685 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1686 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1687 _GDATA_PAGE_SIZE = 50
1688 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1689 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1690 IE_NAME = u'youtube:user'
1692 def _real_extract(self, url):
1694 mobj = re.match(self._VALID_URL, url)
1696 raise ExtractorError(u'Invalid URL: %s' % url)
1698 username = mobj.group(1)
1700 # Download video ids using YouTube Data API. Result size per
1701 # query is limited (currently to 50 videos) so we need to query
1702 # page by page until there are no video ids - it means we got
# NOTE(review): loop initializers (video_ids, pagenum) are not visible in
# this excerpt (numbering gap 1702->1709).
1709 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1711 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1712 page = self._download_webpage(gdata_url, username,
1713 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1715 # Extract video identifiers
# De-duplicate ids within a page while preserving order.
1718 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1719 if mobj.group(1) not in ids_in_page:
1720 ids_in_page.append(mobj.group(1))
1722 video_ids.extend(ids_in_page)
1724 # A little optimization - if current page is not
1725 # "full", ie. does not contain PAGE_SIZE video ids then
1726 # we can assume that this page is the last one - there
1727 # are no more ids on further pages - no need to query
1730 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1735 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1736 url_results = [self.url_result(url, 'Youtube') for url in urls]
1737 return [self.playlist_result(url_results, playlist_title = username)]
1740 class BlipTVUserIE(InfoExtractor):
1741 """Information Extractor for blip.tv users."""
1743 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1745 IE_NAME = u'blip.tv:user'
1747 def _real_extract(self, url):
1749 mobj = re.match(self._VALID_URL, url)
1751 raise ExtractorError(u'Invalid URL: %s' % url)
1753 username = mobj.group(1)
# Mobile episode-list endpoint; users_id is filled in after scraping the
# numeric id from the user's page below.
1755 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1757 page = self._download_webpage(url, username, u'Downloading user page')
# NOTE(review): no None-check on this re.search is visible here — an
# unknown user would raise AttributeError on the next line.
1758 mobj = re.search(r'data-users-id="([^"]+)"', page)
1759 page_base = page_base % mobj.group(1)
1762 # Download video ids using BlipTV Ajax calls. Result size per
1763 # query is limited (currently to 12 videos) so we need to query
1764 # page by page until there are no video ids - it means we got
# NOTE(review): loop initializers (video_ids, pagenum) are not visible in
# this excerpt (numbering gap 1764->1771).
1771 url = page_base + "&page=" + str(pagenum)
1772 page = self._download_webpage(url, username,
1773 u'Downloading video ids from page %d' % pagenum)
1775 # Extract video identifiers
# De-duplicate hrefs within a page while preserving order.
1778 for mobj in re.finditer(r'href="/([^"]+)"', page):
1779 if mobj.group(1) not in ids_in_page:
1780 ids_in_page.append(unescapeHTML(mobj.group(1)))
1782 video_ids.extend(ids_in_page)
1784 # A little optimization - if current page is not
1785 # "full", ie. does not contain PAGE_SIZE video ids then
1786 # we can assume that this page is the last one - there
1787 # are no more ids on further pages - no need to query
1790 if len(ids_in_page) < self._PAGE_SIZE:
1795 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1796 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1797 return [self.playlist_result(url_entries, playlist_title = username)]
1800 class DepositFilesIE(InfoExtractor):
1801 """Information extractor for depositfiles.com"""
1803 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1805 def _real_extract(self, url):
1806 file_id = url.split('/')[-1]
1807 # Rebuild url in english locale
1808 url = 'http://depositfiles.com/en/files/' + file_id
1810 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the free-download button.
1811 free_download_indication = { 'gateway_result' : '1' }
1812 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
# NOTE(review): the `try:` header for the except below is not visible in
# this excerpt (numbering gap 1812->1814).
1814 self.report_download_webpage(file_id)
1815 webpage = compat_urllib_request.urlopen(request).read()
1816 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1817 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1819 # Search for the real file URL
1820 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1821 if (mobj is None) or (mobj.group(1) is None):
1822 # Try to figure out reason of the error.
# Surface the site's own restriction message (e.g. download limits) when
# the download form is absent.
1823 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1824 if (mobj is not None) and (mobj.group(1) is not None):
1825 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1826 raise ExtractorError(u'%s' % restriction_message)
1828 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1830 file_url = mobj.group(1)
1831 file_extension = os.path.splitext(file_url)[1][1:]
1833 # Search for file title
1834 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# NOTE(review): .decode('utf-8') below is Python-2-only; str has no
# .decode on Python 3.
1837 'id': file_id.decode('utf-8'),
1838 'url': file_url.decode('utf-8'),
1840 'upload_date': None,
1841 'title': file_title,
1842 'ext': file_extension.decode('utf-8'),
1846 class FacebookIE(InfoExtractor):
1847 """Information Extractor for Facebook"""
1849 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1850 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1851 _NETRC_MACHINE = 'facebook'
1852 IE_NAME = u'facebook'
1854 def report_login(self):
1855 """Report attempt to log in."""
1856 self.to_screen(u'Logging in')
1858 def _real_initialize(self):
# Optional login using --username/--password or a .netrc entry; failures
# only warn (public videos still work without authentication).
1859 if self._downloader is None:
1864 downloader_params = self._downloader.params
1866 # Attempt to use provided username and password or .netrc data
1867 if downloader_params.get('username', None) is not None:
1868 useremail = downloader_params['username']
1869 password = downloader_params['password']
1870 elif downloader_params.get('usenetrc', False):
1872 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1873 if info is not None:
1877 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1878 except (IOError, netrc.NetrcParseError) as err:
1879 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1882 if useremail is None:
1891 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1894 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means login failed.
1895 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1896 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1898 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1899 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1902 def _real_extract(self, url):
1903 mobj = re.match(self._VALID_URL, url)
1905 raise ExtractorError(u'Invalid URL: %s' % url)
1906 video_id = mobj.group('ID')
1908 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1909 webpage = self._download_webpage(url, video_id)
1911 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1912 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
# Slice the JSON array of SWF variables out of the page's inline JS.
1913 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1915 raise ExtractorError(u'Cannot parse data')
1916 data = dict(json.loads(m.group(1)))
1917 params_raw = compat_urllib_parse.unquote(data['params'])
1918 params = json.loads(params_raw)
1919 video_data = params['video_data'][0]
# Prefer the HD source; fall back to SD when absent.
1920 video_url = video_data.get('hd_src')
1922 video_url = video_data['sd_src']
1924 raise ExtractorError(u'Cannot find video URL')
1925 video_duration = int(video_data['video_duration'])
1926 thumbnail = video_data['thumbnail_src']
1928 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1933 'title': video_title,
1936 'duration': video_duration,
1937 'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Handles regular page URLs, ``/play/`` embed URLs (resolved via their
    redirect fragment) and ``api.swf#<id>`` references.  Metadata comes
    from blip.tv's JSON API, which is only served to an iTunes user agent.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Extracts the filename extension from the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        # NOTE(review): this copy appears to have lines elided (missing
        # ``if mobj is None:`` / ``try:`` headers and the info-dict
        # openings); the dangling statements below belong to them.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        # api.swf#<id> references are rewritten to /play/ URLs first.
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose URL fragment carries the
            # real file reference; resolve it and recurse on the canonical
            # per-video URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves the JSON skin only to an iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                    'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']

                # API timestamps look like '08-15-12 11:15AM'; normalize to
                # the YYYYMMDD form expected for 'upload_date'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                    'id': data['item_id'],
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Supports two page layouts: a simple one where the flv URL is directly
    in a <source> tag, and an encrypted one where an RC4-encrypted XML
    blob must be fetched and decrypted (key derived from a double-base64
    blob plus the video id) to recover the RTMP/HTTP stream parameters.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        # Standard RC4: key-scheduling pass over a 256-entry box, then a
        # keystream XOR over the payload.
        # NOTE(review): initializations (x/y/out) and the payload loop
        # header appear elided in this copy — confirm against upstream.
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        # MD5 hex digest as bytes; used to build the RC4 key.
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        # NOTE(review): numerous guard/try/dict-literal lines appear to be
        # elided in this copy; dangling statements below belong to them.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

            # Double-base64-encoded seed for the RC4 decryption key.
            b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
            b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
            b'TnpsbA0KTVRkbU1tSTRNdz09'

        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Simple case: flv URL directly present in a <source> tag.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

                'upload_date': None,
                'title': video_title,

        # Encrypted case: collect flashvars and fetch the encrypted XML.
        mobj = re.search('var flashvars={(.+?)}', webpage)
            raise ExtractorError(u'Unable to extract video')

        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            # The MTV player variant is not handled; force the D player.
            self._downloader.report_warning(u'avoiding MTV player')
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'

        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
            # RC4 key = md5(double-b64-decoded GK + video id).
            base64.b64decode(base64.b64decode(GK)) +
            str(video_id).encode('utf-8')
        dec_data = self.__rc4crypt(enc_data_b, sk)

        self.report_extraction(video_id)

        # rtmp streams
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        video_url = compat_urllib_parse.unquote(mobj.group(1))
        if 'myvideo2flash' in video_url:
            # rtmpe handshake fails for these; rtmpt works.
            self._downloader.report_warning(u'forcing RTMPT ...')
            video_url = video_url.replace('rtmpe://', 'rtmpt://')

        # extract non rtmp videos
        mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            raise ExtractorError(u'unable to extract url')
        video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            # rtmp play path is '<ext>:<path>' (prefix/path swapped).
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
            # f4m manifests map 1:1 onto m3u8 HLS playlists.
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",

            'tc_url': video_url,
            'upload_date': None,
            'title': video_title,
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Accepts shortname URLs (:tds, :colbert, ...), full-episode URLs and
    clip URLs.  Resolves the mtvnservices media URI from the page, walks
    the MRSS index feed (one <item> per act/part) and, per part, picks a
    rendition from the mediaGen config, transforming the RTMP URL into a
    direct HTTP one.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE(review): the closing triple quote of this verbose regex is not
    # visible in this copy — confirm against upstream.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
        |(https?://)?(www\.)?
        (?P<showname>thedailyshow|colbertnation)\.com/
        (full-episodes/(?P<episode>.*)|
        (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
        |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Known bitrates, lowest to highest; the last entry is the default.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Maps format id -> file extension; contents elided in this copy.
    _video_extensions = {
    # Maps format id -> display dimensions; contents elided in this copy.
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        # NOTE(review): several guard/else lines appear elided in this
        # copy; dangling statements below belong to them.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('shortname'):
            # Expand :tds / :colbert shortcuts to the full-episodes page.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            dlNewest = not mobj.group('episode')
                epTitle = mobj.group('showname')
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
            # The "newest episode" page redirects; re-match on the final
            # URL to learn the concrete episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # MRSS index: one <item> per part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # mediaGen config lists the available renditions per part.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
                    format, rtmp_video_url = f, v

            # RTMP URLs can be rewritten to direct HTTP downloads.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
                'upload_date': officialDate,
                'description': officialTitle,
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads show/episode metadata from the page's <meta> tags, then follows
    the 'config=' parameter of the og:video player URL to a JavaScript
    config object that carries the playlist with the real media URL.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        # NOTE(review): the ``if mobj is None:`` guard appears elided in
        # this copy; the bare raise below belongs to it.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # Page titles look like 'Show : Episode'; keep the episode part.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'player url').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry 0 is an ad/intro; entry 1 is the actual episode.
        videoUrl = playlist[1]['url']

            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Fetches the moogaloop metadata XML for the video id, then the Adobe
    HDS (f4m) manifest it points at, and assembles the final fragment URL
    from the manifest's media node.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        # NOTE(review): guard/try lines and the info-dict opening appear
        # elided in this copy; dangling statements belong to them.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

            'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
            raise ExtractorError(u'Invalid metadata XML file')

        # hdcore parameter is required for the HDS manifest to be served.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
            # f4m elements live in the Adobe f4m XML namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        # Build the direct fragment URL from the manifest host + media id.
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com.

    Everything needed (flv URL, title, thumbnail) is scraped directly
    from the watch page source.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        # NOTE(review): the ``if mobj is None:`` guard and the info-dict
        # opening appear elided in this copy.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The flv URL is URL-encoded inside the page's flashvars.
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Page titles end in ' - XVIDEOS...'; capture only the title part.
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',

        # Extract video thumbnail
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        # NOTE(review): the ``if mobj is None:`` guard appears elided in
        # this copy; the bare raise below belongs to it.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # The resolve endpoint maps the page URL to the track's API record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Stream definitions list the downloadable transcodings per track.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        # NOTE(review): the ``if mobj is None:`` guard appears elided in
        # this copy; the bare raise below belongs to it.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # The resolve endpoint maps the set URL to its API record,
        # including the full track list.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))

        self.report_extraction(full_title)
        # Resolve the stream URL for every track in the set.
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    The real media id is base64-encoded in the page's 'jsclassref'
    variable; decoding it yields the path appended to the rtmpe base URL.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        # NOTE(review): the ``if mobj is None:`` guard appears elided in
        # this copy; the bare raise below belongs to it.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Base64-encoded, URL-quoted media reference.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id and extension from the media filename itself.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    Queries the site's JSON cloudcast API, then probes the listed
    'audio_formats' URLs until one responds, honouring the user's
    requested format when given.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): the ``try:`` header appears elided in this copy;
        # the except TypeError below belongs to it.
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
                # Probe the URL; only a successful open counts.
                compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        # NOTE(review): guard/try/loop-control lines appear elided in this
        # copy; dangling statements below belong to them.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        if req_format is None or req_format == 'best':
            # Try every advertised format until one has a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Three URL shapes are handled: a specific video (course+video), a
    course page (returns references to each VideoPage), and the site root
    (returns references to each CoursePage).  References are re-dispatched
    through self.extract().
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        # NOTE(review): guard/try lines and several dict/list openings
        # appear elided in this copy; dangling statements belong to them.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
                'id': course + '_' + video,
                'upload_date': None,

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
                'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                    note='Downloading course info page',
                                    errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Collect one reference entry per video page on the course.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
                'id': 'Stanford OpenClassroom',
                'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Collect one reference entry per course page on the site root.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Scrapes metadata from the page's mtv_* meta tags, then fetches the
    mediaGen XML to pick a rendition (currently always the last/highest
    quality listed).
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def _real_extract(self, url):
        # NOTE(review): guard/try lines and the info-dict opening appear
        # elided in this copy; dangling statements belong to them.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            # Scheme is optional in _VALID_URL; normalize to http.
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

            # rendition 'type' looks like 'video/mp4'; keep the subtype.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
            raise ExtractorError('Invalid rendition field.')

            'uploader': performer,
            'upload_date': None,
            'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku splits each video into segments.  The playlist JSON supplies an
    obfuscated fileid plus per-segment keys; _get_file_id() de-obfuscates
    the fileid with a seeded shuffle, and a per-segment download URL is
    built by substituting the segment number into fileid positions 8-9.
    """
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

        # NOTE(review): the ``def _gen_sid(self):`` header appears elided
        # in this copy.  Session id = millisecond timestamp + two random
        # numbers, concatenated as digits.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministic shuffle of the alphabet driven by a 16-bit LCG
        # (seed*211+30031 mod 65536); yields the substitution table used
        # by _get_file_id.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        # fileId is a '*'-separated list of indices into the mix string.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
            realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        # NOTE(review): guard lines and format-selection branches appear
        # elided in this copy; dangling statements belong to them.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)

        config = json.loads(jsondata)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())

        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'upload_date': None,
                'title': video_title,

            files_info.append(info)
3045 class XNXXIE(InfoExtractor):
3046 """Information extractor for xnxx.com"""
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3048 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping regexes: flv URL, page title, and big-thumbnail URL.
3050 VIDEO_URL_RE = r'flv_url=(.*?)&'
3051 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3052 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3054 def _real_extract(self, url):
3055 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3057 raise ExtractorError(u'Invalid URL: %s' % url)
3058 video_id = mobj.group(1)
3060 # Get webpage content
3061 webpage = self._download_webpage(url, video_id)
# The flv URL is percent-encoded in the page, hence the unquote below.
3063 video_url = self._search_regex(self.VIDEO_URL_RE,
3064 webpage, u'video URL')
3065 video_url = compat_urllib_parse.unquote(video_url)
3067 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
3070 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3071 webpage, u'thumbnail', fatal=False)
# (elided: start of the returned info dict)
3077 'upload_date': None,
3078 'title': video_title,
3080 'thumbnail': video_thumbnail,
3081 'description': None,
3085 class GooglePlusIE(InfoExtractor):
3086 """Information extractor for plus.google.com."""
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3088 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3089 IE_NAME = u'plus.google'
3091 def _real_extract(self, url):
3092 # Extract id from URL
3093 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3095 raise ExtractorError(u'Invalid URL: %s' % url)
3097 post_url = mobj.group(0)
3098 video_id = mobj.group(1)
3100 video_extension = 'flv'
3102 # Step 1, Retrieve post webpage to extract further information
3103 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3105 self.report_extraction(video_id)
3107 # Extract update date
3108 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3109 webpage, u'upload date', fatal=False)
3111 # Convert timestring to a format suitable for filename
# Reformats the scraped date into YYYYMMDD (the upload_date convention).
3112 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3113 upload_date = upload_date.strftime('%Y%m%d')
3116 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3117 webpage, u'uploader', fatal=False)
3120 # Get the first line for title
3121 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3122 webpage, 'title', default=u'NA')
3124 # Step 2, Stimulate clicking the image box to launch video
3125 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3126 webpage, u'video page URL')
3127 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3129 # Extract video links on video page
3130 """Extract video links of all sizes"""
3131 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3132 mobj = re.findall(pattern, webpage)
# (elided: the emptiness check guarding the raise below)
3134 raise ExtractorError(u'Unable to extract video links')
3136 # Sort in resolution
3137 links = sorted(mobj)
3139 # Choose the lowest of the sort, i.e. highest resolution
3140 video_url = links[-1]
3141 # Only get the url. The resolution part in the tuple has no use anymore
3142 video_url = video_url[-1]
3143 # Treat escaped \u0026 style hex
# Python 2 strings have .decode; on Python 3 an AttributeError is caught
# and the escape decoding is done via bytes instead.
3145 video_url = video_url.decode("unicode_escape")
3146 except AttributeError: # Python 3
3147 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3153 'uploader': uploader,
3154 'upload_date': upload_date,
3155 'title': video_title,
3156 'ext': video_extension,
3159 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages; the media URL is built
# directly from the path component of the page URL.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3160 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3163 def _real_extract(self, url):
3164 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3166 raise ExtractorError(u'Invalid URL: %s' % url)
3168 video_id = mobj.group(1)
3170 webpage = self._download_webpage(url, video_id)
3172 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3174 shortened_video_id = video_id.rpartition('/')[2]
3175 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3176 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3178 # It isn't there in the HTML it returns to us
3179 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3181 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3184 'id': shortened_video_id,
3188 # 'uploader_date': uploader_date,
3189 'description': description,
3193 class JustinTVIE(InfoExtractor):
3194 """Information extractor for justin.tv and twitch.tv"""
3195 # TODO: One broadcast may be split into multiple videos. The key
3196 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3197 # starts at 1 and increases. Can we treat all parts as one video?
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3199 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3201 (?P<channelid>[^/]+)|
3202 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3203 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3207 _JUSTIN_PAGE_LIMIT = 100
3208 IE_NAME = u'justin.tv'
3210 def report_download_page(self, channel, offset):
3211 """Report attempt to download a single page of videos."""
3212 self.to_screen(u'%s: Downloading video information from %d to %d' %
3213 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3215 # Return count of items, list of *valid* items
3216 def _parse_page(self, url, video_id):
# Downloads one page of the Justin.tv JSON API and converts each clip
# entry into an info dict; returns (raw item count, parsed items).
3217 webpage = self._download_webpage(url, video_id,
3218 u'Downloading video info JSON',
3219 u'unable to download video info JSON')
3221 response = json.loads(webpage)
# A non-list response is an API error object with an 'error' field.
3222 if type(response) != list:
3223 error_text = response.get('error', 'unknown error')
3224 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3226 for clip in response:
3227 video_url = clip['video_file_url']
3229 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-style; strip the dashes from its date part to get
# the YYYYMMDD upload_date.
3230 video_date = re.sub('-', '', clip['start_time'][:10])
3231 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3232 video_id = clip['id']
3233 video_title = clip.get('title', video_id)
3237 'title': video_title,
3238 'uploader': clip.get('channel_name', video_uploader_id),
3239 'uploader_id': video_uploader_id,
3240 'upload_date': video_date,
3241 'ext': video_extension,
3243 return (len(response), info)
3245 def _real_extract(self, url):
3246 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3248 raise ExtractorError(u'invalid URL: %s' % url)
3250 api_base = 'http://api.justin.tv'
# Three URL shapes: whole channel archive, single broadcast, or chapter.
3252 if mobj.group('channelid'):
3254 video_id = mobj.group('channelid')
3255 api = api_base + '/channel/archives/%s.json' % video_id
3256 elif mobj.group('chapterid'):
3257 chapter_id = mobj.group('chapterid')
3259 webpage = self._download_webpage(url, chapter_id)
3260 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3262 raise ExtractorError(u'Cannot find archive of a chapter')
3263 archive_id = m.group(1)
3265 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3266 chapter_info_xml = self._download_webpage(api, chapter_id,
3267 note=u'Downloading chapter information',
3268 errnote=u'Chapter information download failed')
3269 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Locate the <archive> element whose id matches the page's archive_id.
3270 for a in doc.findall('.//archive'):
3271 if archive_id == a.find('./id').text:
3274 raise ExtractorError(u'Could not find chapter in chapter information')
3276 video_url = a.find('./video_file_url').text
3277 video_ext = video_url.rpartition('.')[2] or u'flv'
3279 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3280 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3281 note='Downloading chapter metadata',
3282 errnote='Download of chapter metadata failed')
3283 chapter_info = json.loads(chapter_info_json)
3285 bracket_start = int(doc.find('.//bracket_start').text)
3286 bracket_end = int(doc.find('.//bracket_end').text)
3288 # TODO determine start (and probably fix up file)
3289 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3290 #video_url += u'?start=' + TODO:start_timestamp
3291 # bracket_start is 13290, but we want 51670615
3292 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3293 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3296 'id': u'c' + chapter_id,
3299 'title': chapter_info['title'],
3300 'thumbnail': chapter_info['preview'],
3301 'description': chapter_info['description'],
3302 'uploader': chapter_info['channel']['display_name'],
3303 'uploader_id': chapter_info['channel']['name'],
3307 video_id = mobj.group('videoid')
3308 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3310 self.report_extraction(video_id)
# Paged fetch: keep requesting _JUSTIN_PAGE_LIMIT items until a short
# page (or non-paged mode) signals the end.
3314 limit = self._JUSTIN_PAGE_LIMIT
3317 self.report_download_page(video_id, offset)
3318 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3319 page_count, page_info = self._parse_page(page_url, video_id)
3320 info.extend(page_info)
3321 if not paged or page_count != limit:
3326 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3327 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3329 def _real_extract(self, url):
3330 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3332 raise ExtractorError(u'invalid URL: %s' % url)
3334 video_id = mobj.group('id')
3335 webpage = self._download_webpage(url, video_id)
3337 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3338 webpage, u'video URL', flags=re.DOTALL)
# Two title patterns are tried in order: player page heading, then <title>.
3340 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3341 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3343 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3344 webpage, u'description', fatal=False, flags=re.DOTALL)
3351 'description': video_description,
3355 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com video pages; returns a
# playlist of all movies found on a game's page.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3356 _VALID_URL = r"""http://store\.steampowered\.com/
3358 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3360 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3362 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
3363 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
3366 def suitable(cls, url):
3367 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose-mode regex.
3368 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3370 def _real_extract(self, url):
3371 m = re.match(self._VALID_URL, url, re.VERBOSE)
3372 gameID = m.group('gameID')
3374 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3375 webpage = self._download_webpage(videourl, gameID)
# Age-gated pages are refetched through the agecheck URL (birth date 1970).
3377 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3378 videourl = self._AGECHECK_TEMPLATE % gameID
3379 self.report_age_confirmation()
3380 webpage = self._download_webpage(videourl, gameID)
3382 self.report_extraction(gameID)
3383 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3384 webpage, 'game title')
3386 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3387 mweb = re.finditer(urlRE, webpage)
3388 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3389 titles = re.finditer(namesRE, webpage)
3390 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3391 thumbs = re.finditer(thumbsRE, webpage)
# The three iterators are zipped positionally; entries are assumed to
# appear in the same order in the page -- TODO confirm.
3393 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3394 video_id = vid.group('videoID')
3395 title = vtitle.group('videoName')
3396 video_url = vid.group('videoURL')
3397 video_thumb = thumb.group('thumbnail')
3399 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3404 'title': unescapeHTML(title),
3405 'thumbnail': video_thumb
3408 return [self.playlist_result(videos, gameID, game_title)]
3410 class UstreamIE(InfoExtractor):
# Information extractor for recorded Ustream videos.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3411 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3412 IE_NAME = u'ustream'
3414 def _real_extract(self, url):
3415 m = re.match(self._VALID_URL, url)
3416 video_id = m.group('videoID')
# The media URL is derived directly from the video id on the CDN.
3418 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3419 webpage = self._download_webpage(url, video_id)
3421 self.report_extraction(video_id)
3423 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3426 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3427 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3429 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3430 webpage, u'thumbnail', fatal=False)
3436 'title': video_title,
3437 'uploader': uploader,
3438 'thumbnail': thumbnail,
3442 class WorldStarHipHopIE(InfoExtractor):
# Information extractor for worldstarhiphop.com / worldstarcandy.com.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3443 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3444 IE_NAME = u'WorldStarHipHop'
3446 def _real_extract(self, url):
3447 m = re.match(self._VALID_URL, url)
3448 video_id = m.group('id')
3450 webpage_src = self._download_webpage(url, video_id)
# The player URL is passed to the flash object via so.addVariable("file",...).
3452 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3453 webpage_src, u'video URL')
3455 if 'mp4' in video_url:
3460 video_title = self._html_search_regex(r"<title>(.*)</title>",
3461 webpage_src, u'title')
3463 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3464 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3465 webpage_src, u'thumbnail', fatal=False)
# Fallback title for "candy" pages, scraped from the candytitles span.
3468 _title = r"""candytitles.*>(.*)</span>"""
3469 mobj = re.search(_title, webpage_src)
3470 if mobj is not None:
3471 video_title = mobj.group(1)
3476 'title' : video_title,
3477 'thumbnail' : thumbnail,
3482 class RBMARadioIE(InfoExtractor):
# Information extractor for rbmaradio.com shows; metadata comes from an
# inline JSON blob assigned to window.gon.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3483 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3485 def _real_extract(self, url):
3486 m = re.match(self._VALID_URL, url)
3487 video_id = m.group('videoID')
3489 webpage = self._download_webpage(url, video_id)
3491 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
3492 webpage, u'json data', flags=re.MULTILINE)
3495 data = json.loads(json_data)
3496 except ValueError as e:
3497 raise ExtractorError(u'Invalid JSON: ' + str(e))
# '&cbr=256' selects the 256 kbps stream on the Akamai URL.
3499 video_url = data['akamai_url'] + '&cbr=256'
3500 url_parts = compat_urllib_parse_urlparse(video_url)
3501 video_ext = url_parts.path.rpartition('.')[2]
3506 'title': data['title'],
3507 'description': data.get('teaser_text'),
3508 'location': data.get('country_of_origin'),
3509 'uploader': data.get('host', {}).get('name'),
3510 'uploader_id': data.get('host', {}).get('slug'),
3511 'thumbnail': data.get('image', {}).get('large_url_2x'),
3512 'duration': data.get('duration'),
3517 class YouPornIE(InfoExtractor):
3518 """Information extractor for youporn.com."""
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3519 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3521 def _print_formats(self, formats):
3522 """Print all available formats"""
3523 print(u'Available formats:')
3524 print(u'ext\t\tformat')
3525 print(u'---------------------------------')
3526 for format in formats:
3527 print(u'%s\t\t%s' % (format['ext'], format['format']))
3529 def _specific(self, req_format, formats):
# Linear scan for the entry whose 'format' equals the requested one.
3531 if(x["format"]==req_format):
3535 def _real_extract(self, url):
3536 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3538 raise ExtractorError(u'Invalid URL: %s' % url)
3539 video_id = mobj.group('videoid')
# age_verified cookie bypasses the age gate for the page fetch.
3541 req = compat_urllib_request.Request(url)
3542 req.add_header('Cookie', 'age_verified=1')
3543 webpage = self._download_webpage(req, video_id)
3545 # Get JSON parameters
3546 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3548 params = json.loads(json_params)
3550 raise ExtractorError(u'Invalid JSON')
3552 self.report_extraction(video_id)
3554 video_title = params['title']
3555 upload_date = unified_strdate(params['release_date_f'])
3556 video_description = params['description']
3557 video_uploader = params['submitted_by']
3558 thumbnail = params['thumbnails'][0]['image']
3560 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3562 # Get all of the formats available
3563 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3564 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3565 webpage, u'download list').strip()
3567 # Get all of the links from the page
3568 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3569 links = re.findall(LINK_RE, download_list_html)
3570 if(len(links) == 0):
3571 raise ExtractorError(u'ERROR: no known formats available for video')
3573 self.to_screen(u'Links found: %d' % len(links))
3578 # A link looks like this:
3579 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3580 # A path looks like this:
3581 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# The format label (e.g. "480p-370k") is recovered from the URL path.
3582 video_url = unescapeHTML( link )
3583 path = compat_urllib_parse_urlparse( video_url ).path
3584 extension = os.path.splitext( path )[1][1:]
3585 format = path.split('/')[4].split('_')[:2]
3588 format = "-".join( format )
3589 # title = u'%s-%s-%s' % (video_title, size, bitrate)
3594 'uploader': video_uploader,
3595 'upload_date': upload_date,
3596 'title': video_title,
3599 'thumbnail': thumbnail,
3600 'description': video_description
# Format selection mirrors the downloader's --format semantics:
# listformats, best/worst, all, or a specific named format.
3603 if self._downloader.params.get('listformats', None):
3604 self._print_formats(formats)
3607 req_format = self._downloader.params.get('format', None)
3608 self.to_screen(u'Format: %s' % req_format)
3610 if req_format is None or req_format == 'best':
3612 elif req_format == 'worst':
3613 return [formats[-1]]
3614 elif req_format in ('-1', 'all'):
3617 format = self._specific( req_format, formats )
3619 raise ExtractorError(u'Requested format not available')
3624 class PornotubeIE(InfoExtractor):
3625 """Information extractor for pornotube.com."""
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3626 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3628 def _real_extract(self, url):
3629 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3631 raise ExtractorError(u'Invalid URL: %s' % url)
3633 video_id = mobj.group('videoid')
3634 video_title = mobj.group('title')
3636 # Get webpage content
3637 webpage = self._download_webpage(url, video_id)
3640 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3641 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3642 video_url = compat_urllib_parse.unquote(video_url)
3644 #Get the uploaded date
3645 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3646 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
# unified_strdate normalizes the scraped date to YYYYMMDD when present.
3647 if upload_date: upload_date = unified_strdate(upload_date)
3649 info = {'id': video_id,
3652 'upload_date': upload_date,
3653 'title': video_title,
3659 class YouJizzIE(InfoExtractor):
3660 """Information extractor for youjizz.com."""
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3661 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3663 def _real_extract(self, url):
3664 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3666 raise ExtractorError(u'Invalid URL: %s' % url)
3668 video_id = mobj.group('videoid')
3670 # Get webpage content
3671 webpage = self._download_webpage(url, video_id)
3673 # Get the video title
3674 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3675 webpage, u'title').strip()
3677 # Get the embed page
3678 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3680 raise ExtractorError(u'ERROR: unable to extract embed page')
# The actual media URL lives on the embed page, so refetch using it; note
# video_id is rebound to the embed page's id.
3682 embed_page_url = result.group(0).strip()
3683 video_id = result.group('videoid')
3685 webpage = self._download_webpage(embed_page_url, video_id)
3688 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3689 webpage, u'video URL')
3691 info = {'id': video_id,
3693 'title': video_title,
3696 'player_url': embed_page_url}
3700 class EightTracksIE(InfoExtractor):
# Information extractor for 8tracks.com mixes; iterates the play API until
# the last track is reported.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below (including the binding of `mix_id`).
3702 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3704 def _real_extract(self, url):
3705 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3707 raise ExtractorError(u'Invalid URL: %s' % url)
3708 playlist_id = mobj.group('id')
3710 webpage = self._download_webpage(url, playlist_id)
3712 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3713 data = json.loads(json_like)
# A random session id is required by the play API.
3715 session = str(random.randint(0, 1000000000))
3717 track_count = data['tracks_count']
3718 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3719 next_url = first_url
3721 for i in itertools.count():
3722 api_json = self._download_webpage(next_url, playlist_id,
3723 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3724 errnote=u'Failed to download song information')
3725 api_data = json.loads(api_json)
3726 track_data = api_data[u'set']['track']
3728 'id': track_data['id'],
3729 'url': track_data['track_file_stream_url'],
3730 'title': track_data['performer'] + u' - ' + track_data['name'],
3731 'raw_title': track_data['name'],
3732 'uploader_id': data['user']['login'],
# The loop ends when the API flags the last track of the set.
3736 if api_data['set']['at_last_track']:
3738 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3741 class KeekIE(InfoExtractor):
# Information extractor for keek.com; media and thumbnail URLs are built
# directly from the video id on the CDN.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3742 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3745 def _real_extract(self, url):
3746 m = re.match(self._VALID_URL, url)
3747 video_id = m.group('videoID')
3749 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3750 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3751 webpage = self._download_webpage(url, video_id)
3753 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3756 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3757 webpage, u'uploader', fatal=False)
3763 'title': video_title,
3764 'thumbnail': thumbnail,
3765 'uploader': uploader
3769 class TEDIE(InfoExtractor):
# Information extractor for www.ted.com; handles both single talks and
# playlists (the latter expand into TED url_results).
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3770 _VALID_URL=r'''http://www\.ted\.com/
3772 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3774 ((?P<type_talk>talks)) # We have a simple talk
3776 (/lang/(.*?))? # The url may contain the language
3777 /(?P<name>\w+) # Here goes the name and then ".html"
3781 def suitable(cls, url):
3782 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose-mode regex.
3783 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3785 def _real_extract(self, url):
3786 m=re.match(self._VALID_URL, url, re.VERBOSE)
3787 if m.group('type_talk'):
3788 return [self._talk_info(url)]
3790 playlist_id=m.group('playlist_id')
3791 name=m.group('name')
3792 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3793 return [self._playlist_videos_info(url,name,playlist_id)]
3795 def _talk_video_link(self,mediaSlug):
3796 '''Returns the video link for that mediaSlug'''
3797 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3799 def _playlist_videos_info(self,url,name,playlist_id=0):
3800 '''Returns the videos of the playlist'''
3802 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3803 ([.\s]*?)data-playlist_item_id="(\d+)"
3804 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3806 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3807 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3808 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3809 m_names=re.finditer(video_name_RE,webpage)
3811 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3812 m_playlist = re.search(playlist_RE, webpage)
3813 playlist_title = m_playlist.group('playlist_title')
# Each talk found on the playlist page becomes a deferred url_result
# handled by this same extractor ('TED').
3815 playlist_entries = []
3816 for m_video, m_name in zip(m_videos,m_names):
3817 video_id=m_video.group('video_id')
3818 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3819 playlist_entries.append(self.url_result(talk_url, 'TED'))
3820 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3822 def _talk_info(self, url, video_id=0):
3823 """Return the video for the talk in the url"""
3824 m=re.match(self._VALID_URL, url,re.VERBOSE)
3825 videoName=m.group('name')
3826 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3827 # If the url includes the language we get the title translated
3828 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3829 title=re.search(title_RE, webpage).group('title')
3830 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3831 "id":(?P<videoID>[\d]+).*?
3832 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3833 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3834 thumb_match=re.search(thumb_RE,webpage)
3835 info_match=re.search(info_RE,webpage,re.VERBOSE)
3836 video_id=info_match.group('videoID')
3837 mediaSlug=info_match.group('mediaSlug')
3838 video_url=self._talk_video_link(mediaSlug)
3844 'thumbnail': thumb_match.group('thumbnail')
3848 class MySpassIE(InfoExtractor):
# Information extractor for myspass.de; metadata comes from an XML API.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3849 _VALID_URL = r'http://www.myspass.de/.*'
3851 def _real_extract(self, url):
3852 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3854 # video id is the last path element of the URL
3855 # usually there is a trailing slash, so also try the second but last
3856 url_path = compat_urllib_parse_urlparse(url).path
3857 url_parent_path, video_id = os.path.split(url_path)
3859 _, video_id = os.path.split(url_parent_path)
3862 metadata_url = META_DATA_URL_TEMPLATE % video_id
3863 metadata_text = self._download_webpage(metadata_url, video_id)
3864 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3866 # extract values from metadata
# url_flv and title are mandatory; format, description and thumbnail are
# optional, each guarded by a None check on its element.
3867 url_flv_el = metadata.find('url_flv')
3868 if url_flv_el is None:
3869 raise ExtractorError(u'Unable to extract download url')
3870 video_url = url_flv_el.text
3871 extension = os.path.splitext(video_url)[1][1:]
3872 title_el = metadata.find('title')
3873 if title_el is None:
3874 raise ExtractorError(u'Unable to extract title')
3875 title = title_el.text
3876 format_id_el = metadata.find('format_id')
3877 if format_id_el is None:
3880 format = format_id_el.text
3881 description_el = metadata.find('description')
3882 if description_el is not None:
3883 description = description_el.text
3886 imagePreview_el = metadata.find('imagePreview')
3887 if imagePreview_el is not None:
3888 thumbnail = imagePreview_el.text
3897 'thumbnail': thumbnail,
3898 'description': description
3902 class SpiegelIE(InfoExtractor):
# Information extractor for spiegel.de videos; stream data comes from a
# per-video XML document.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3903 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3905 def _real_extract(self, url):
3906 m = re.match(self._VALID_URL, url)
3907 video_id = m.group('videoID')
3909 webpage = self._download_webpage(url, video_id)
3911 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3914 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3915 xml_code = self._download_webpage(xml_url, video_id,
3916 note=u'Downloading XML', errnote=u'Failed to download XML')
3918 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last type-element in the document is used; presumably the highest
# quality variant -- TODO confirm against the XML schema.
3919 last_type = idoc[-1]
3920 filename = last_type.findall('./filename')[0].text
3921 duration = float(last_type.findall('./duration')[0].text)
3923 video_url = 'http://video2.spiegel.de/flash/' + filename
3924 video_ext = filename.rpartition('.')[2]
3929 'title': video_title,
3930 'duration': duration,
3934 class LiveLeakIE(InfoExtractor):
# Information extractor for liveleak.com.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3936 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3937 IE_NAME = u'liveleak'
3939 def _real_extract(self, url):
3940 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
3942 raise ExtractorError(u'Invalid URL: %s' % url)
3944 video_id = mobj.group('video_id')
3946 webpage = self._download_webpage(url, video_id)
3948 video_url = self._search_regex(r'file: "(.*?)",',
3949 webpage, u'video URL')
# The og:title carries a site prefix that is stripped for the clean title.
3951 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3952 webpage, u'title').replace('LiveLeak.com -', '').strip()
3954 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3955 webpage, u'description', fatal=False)
3957 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3958 webpage, u'uploader', fatal=False)
3964 'title': video_title,
3965 'description': video_description,
3966 'uploader': video_uploader
3971 class ARDIE(InfoExtractor):
# Information extractor for the ARD Mediathek; picks the highest-quality
# default-type stream and distinguishes RTMP from plain HTTP delivery.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
3972 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3973 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3974 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3976 def _real_extract(self, url):
3977 # determine video id from url
3978 m = re.match(self._VALID_URL, url)
# Prefer a numeric documentId query parameter when present.
3980 numid = re.search(r'documentId=([0-9]+)', url)
3982 video_id = numid.group(1)
3984 video_id = m.group('video_id')
3986 # determine title and media streams from webpage
3987 html = self._download_webpage(url, video_id)
3988 title = re.search(self._TITLE, html).group('title')
3989 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# (elided: the emptiness check guarding the fsk assertion/raise below;
# "fsk" marks age-restricted content only available after 8 pm)
3991 assert '"fsk"' in html
3992 raise ExtractorError(u'This video is only available after 8:00 pm')
3994 # choose default media type and highest quality for now
3995 stream = max([s for s in streams if int(s["media_type"]) == 0],
3996 key=lambda s: int(s["quality"]))
3998 # there's two possibilities: RTMP stream or HTTP download
3999 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4000 if stream['rtmp_url']:
4001 self.to_screen(u'RTMP download detected')
4002 assert stream['video_url'].startswith('mp4:')
4003 info["url"] = stream["rtmp_url"]
4004 info["play_path"] = stream['video_url']
4006 assert stream["video_url"].endswith('.mp4')
4007 info["url"] = stream["video_url"]
4010 class ZDFIE(InfoExtractor):
# Information extractor for the ZDF Mediathek; resolves an intermediate
# "wstreaming" ASX page to an mms:// or rtsp:// media URL.
# NOTE(review): elided listing -- some original lines are missing between
# the numbered lines below.
4011 _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4012 _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
4013 _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
4014 _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
4015 _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
4017 def _real_extract(self, url):
4018 mobj = re.match(self._VALID_URL, url)
# (elided: the `if mobj is None:` guard for the raise below)
4020 raise ExtractorError(u'Invalid URL: %s' % url)
4021 video_id = mobj.group('video_id')
4023 html = self._download_webpage(url, video_id)
4024 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4026 raise ExtractorError(u'No media url found.')
4028 # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
4029 # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
4030 # choose first/default media type and highest quality for now
4031 for s in streams: #find 300 - dsl1000mbit
4032 if s['quality'] == '300' and s['media_type'] == 'wstreaming':
4035 for s in streams: #find veryhigh - dsl2000mbit
4036 if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
4040 raise ExtractorError(u'No stream found.')
# The chosen stream URL points at a small descriptor page that contains
# the real mms:// (or rtsp://) URL.
4042 media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
4044 self.report_extraction(video_id)
4045 mobj = re.search(self._TITLE, html)
4047 raise ExtractorError(u'Cannot extract title')
4048 title = unescapeHTML(mobj.group('title'))
4050 mobj = re.search(self._MMS_STREAM, media_link)
4052 mobj = re.search(self._RTSP_STREAM, media_link)
4054 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
4055 mms_url = mobj.group('video_url')
4057 mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
4059 raise ExtractorError(u'Cannot extract extention')
4060 ext = mobj.group('ext')
4062 return [{'id': video_id,
4068 class TumblrIE(InfoExtractor):
# Extractor for video posts hosted on *.tumblr.com.
4069 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4071 def _real_extract(self, url):
4072 m_url = re.match(self._VALID_URL, url)
4073 video_id = m_url.group('id')
4074 blog = m_url.group('blog_name')
# Normalize to the canonical /post/<id>/ URL before downloading the page.
4076 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4077 webpage = self._download_webpage(url, video_id)
# \\x22 is an escaped double quote inside the page's embedded JavaScript.
4079 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4080 video = re.search(re_video, webpage)
# NOTE(review): the `if video is None:` guard for this raise is not visible in this excerpt.
4082 raise ExtractorError(u'Unable to extract video')
4083 video_url = video.group('video_url')
4084 ext = video.group('ext')
4086 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
4087 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Strip the JS escaping backslashes from the thumbnail URL.
4088 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
4090 # The only place where you can get a title, it's not complete,
4091 # but searching in other places doesn't work for all videos
4092 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
4093 webpage, u'title', flags=re.DOTALL)
4095 return [{'id': video_id,
4097 'title': video_title,
4098 'thumbnail': video_thumbnail,
4102 class BandcampIE(InfoExtractor):
4103 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4105 def _real_extract(self, url):
4106 mobj = re.match(self._VALID_URL, url)
4107 title = mobj.group('title')
4108 webpage = self._download_webpage(url, title)
4109 # We get the link to the free download page
4110 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4111 if m_download is None:
4112 raise ExtractorError(u'No free songs found')
4114 download_link = m_download.group(1)
4115 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4116 webpage, re.MULTILINE|re.DOTALL).group('id')
4118 download_webpage = self._download_webpage(download_link, id,
4119 'Downloading free downloads page')
4120 # We get the dictionary of the track from some javascrip code
4121 info = re.search(r'items: (.*?),$',
4122 download_webpage, re.MULTILINE).group(1)
4123 info = json.loads(info)[0]
4124 # We pick mp3-320 for now, until format selection can be easily implemented.
4125 mp3_info = info[u'downloads'][u'mp3-320']
4126 # If we try to use this url it says the link has expired
4127 initial_url = mp3_info[u'url']
4128 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4129 m_url = re.match(re_url, initial_url)
4130 #We build the url we will use to get the final track url
4131 # This url is build in Bandcamp in the script download_bunde_*.js
4132 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4133 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4134 # If we could correctly generate the .rand field the url would be
4135 #in the "download_url" key
4136 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4138 track_info = {'id':id,
4139 'title' : info[u'title'],
4142 'thumbnail' : info[u'thumb_url'],
4143 'uploader' : info[u'artist']
4148 class RedTubeIE(InfoExtractor):
4149 """Information Extractor for redtube"""
4150 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4152 def _real_extract(self,url):
4153 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard for this raise is not visible in this excerpt.
4155 raise ExtractorError(u'Invalid URL: %s' % url)
4157 video_id = mobj.group('id')
# Site serves MP4 via a <source> tag, so the extension is fixed.
4158 video_extension = 'mp4'
4159 webpage = self._download_webpage(url, video_id)
4161 self.report_extraction(video_id)
4163 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
4164 webpage, u'video URL')
4166 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
4172 'ext': video_extension,
4173 'title': video_title,
4176 class InaIE(InfoExtractor):
4177 """Information Extractor for Ina.fr"""
4178 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4180 def _real_extract(self,url):
4181 mobj = re.match(self._VALID_URL, url)
4183 video_id = mobj.group('id')
# Metadata comes from the player's MRSS feed rather than the HTML page.
4184 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4185 video_extension = 'mp4'
4186 webpage = self._download_webpage(mrss_url, video_id)
4188 self.report_extraction(video_id)
# NOTE(review): the dots in `mp4.ina.fr` are unescaped metacharacters; they match
# the intended URL but also any character — consider `mp4\.ina\.fr`.
4190 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
4191 webpage, u'video URL')
4193 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
4199 'ext': video_extension,
4200 'title': video_title,
4203 class HowcastIE(InfoExtractor):
4204 """Information Extractor for Howcast.com"""
4205 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4207 def _real_extract(self, url):
4208 mobj = re.match(self._VALID_URL, url)
4210 video_id = mobj.group('id')
# Re-canonicalize to the plain /videos/<id> URL before fetching.
4211 webpage_url = 'http://www.howcast.com/videos/' + video_id
4212 webpage = self._download_webpage(webpage_url, video_id)
4214 self.report_extraction(video_id)
# The MP4 URL sits in embedded player config (`file: "..."`), not in a video tag.
4216 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
4217 webpage, u'video URL')
# Meta tags may use either double or single quotes, hence the alternation.
4219 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
4222 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
4223 webpage, u'description', fatal=False)
4225 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
4226 webpage, u'thumbnail', fatal=False)
4232 'title': video_title,
4233 'description': video_description,
4234 'thumbnail': thumbnail,
4237 class VineIE(InfoExtractor):
4238 """Information Extractor for Vine.co"""
4239 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4241 def _real_extract(self, url):
4242 mobj = re.match(self._VALID_URL, url)
4244 video_id = mobj.group('id')
# Canonical HTTPS permalink; the stream URL is in a twitter:player meta tag.
4245 webpage_url = 'https://vine.co/v/' + video_id
4246 webpage = self._download_webpage(webpage_url, video_id)
4248 self.report_extraction(video_id)
4250 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
4251 webpage, u'video URL')
4253 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# The optional (\?.*?)? group drops any query string from the thumbnail URL.
4256 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
4257 webpage, u'thumbnail', fatal=False)
# DOTALL because the uploader <h2> may span multiple lines inside the user div.
4259 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
4260 webpage, u'uploader', fatal=False, flags=re.DOTALL)
4266 'title': video_title,
4267 'thumbnail': thumbnail,
4268 'uploader': uploader,
4271 class FlickrIE(InfoExtractor):
4272 """Information Extractor for Flickr videos"""
4273 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4275 def _real_extract(self, url):
4276 mobj = re.match(self._VALID_URL, url)
4278 video_id = mobj.group('id')
4279 video_uploader_id = mobj.group('uploader_id')
4280 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4281 webpage = self._download_webpage(webpage_url, video_id)
# The per-photo "secret" token is required by the video XML endpoints below.
4283 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
# Two-step resolution: first XML yields a node_id, second XML yields the stream.
4285 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4286 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
4288 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
4289 first_xml, u'node_id')
4291 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4292 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4294 self.report_extraction(video_id)
4296 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
# NOTE(review): the `if mobj is None:` guard for this raise is not visible in this excerpt.
4298 raise ExtractorError(u'Unable to extract video url')
# Final URL = APP prefix + HTML-unescaped FULLPATH from the playlist XML.
4299 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
4301 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
4302 webpage, u'video title')
4304 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
4305 webpage, u'description', fatal=False)
4307 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
4308 webpage, u'thumbnail', fatal=False)
4314 'title': video_title,
4315 'description': video_description,
4316 'thumbnail': thumbnail,
4317 'uploader_id': video_uploader_id,
4320 class TeamcocoIE(InfoExtractor):
# Extractor for teamcoco.com; the page's numeric data-id keys a CVP XML feed.
4321 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4323 def _real_extract(self, url):
4324 mobj = re.match(self._VALID_URL, url)
4326 raise ExtractorError(u'Invalid URL: %s' % url)
4327 url_title = mobj.group('url_title')
4328 webpage = self._download_webpage(url, url_title)
4330 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
4331 webpage, u'video id')
4333 self.report_extraction(video_id)
4335 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
4338 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
4339 webpage, u'thumbnail', fatal=False)
4341 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
4342 webpage, u'description', fatal=False)
# The real media URL lives in the high-quality <file> entry of the CVP XML.
4344 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4345 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
4347 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
4354 'title': video_title,
4355 'thumbnail': thumbnail,
4356 'description': video_description,
4359 class XHamsterIE(InfoExtractor):
4360 """Information Extractor for xHamster"""
4361 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4363 def _real_extract(self,url):
4364 mobj = re.match(self._VALID_URL, url)
4366 video_id = mobj.group('id')
4367 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
4368 webpage = self._download_webpage(mrss_url, video_id)
4370 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4372 raise ExtractorError(u'Unable to extract media URL')
4373 if len(mobj.group('server')) == 0:
4374 video_url = compat_urllib_parse.unquote(mobj.group('file'))
4376 video_url = mobj.group('server')+'/key='+mobj.group('file')
4377 video_extension = video_url.split('.')[-1]
4379 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
4382 # Can't see the description anywhere in the UI
4383 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
4384 # webpage, u'description', fatal=False)
4385 # if video_description: video_description = unescapeHTML(video_description)
4387 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4389 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4391 video_upload_date = None
4392 self._downloader.report_warning(u'Unable to extract upload date')
4394 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
4395 webpage, u'uploader id', default=u'anonymous')
4397 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
4398 webpage, u'thumbnail', fatal=False)
4403 'ext': video_extension,
4404 'title': video_title,
4405 # 'description': video_description,
4406 'upload_date': video_upload_date,
4407 'uploader_id': video_uploader_id,
4408 'thumbnail': video_thumbnail
4411 class HypemIE(InfoExtractor):
4412 """Information Extractor for hypem"""
4413 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4415 def _real_extract(self, url):
4416 mobj = re.match(self._VALID_URL, url)
4418 raise ExtractorError(u'Invalid URL: %s' % url)
4419 track_id = mobj.group(1)
# The ax/ts query parameters mimic the site's AJAX request signature.
4421 data = { 'ax': 1, 'ts': time.time() }
4422 data_encoded = compat_urllib_parse.urlencode(data)
4423 complete_url = url + "?" + data_encoded
4424 request = compat_urllib_request.Request(complete_url)
4425 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The session cookie must be replayed on the /serve/source request below.
4426 cookie = urlh.headers.get('Set-Cookie', '')
4428 self.report_extraction(track_id)
4430 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
4431 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
4433 track_list = json.loads(html_tracks)
4434 track = track_list[u'tracks'][0]
4436 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4439 track_id = track[u"id"]
4440 artist = track[u"artist"]
4441 title = track[u"song"]
# NOTE(review): `key` is assigned on a line not visible in this excerpt
# (presumably track[u"key"]) — confirm against the full source.
4443 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4444 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4445 request.add_header('cookie', cookie)
4446 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4448 song_data = json.loads(song_data_json)
4450 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4451 final_url = song_data[u"url"]
4461 class Vbox7IE(InfoExtractor):
4462 """Information Extractor for Vbox7"""
4463 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
4465 def _real_extract(self,url):
4466 mobj = re.match(self._VALID_URL, url)
4468 raise ExtractorError(u'Invalid URL: %s' % url)
4469 video_id = mobj.group(1)
# The play: URL serves a JS redirect; follow it manually by scraping
# window.location and appending it to the URL we actually landed on.
4471 redirect_page, urlh = self._download_webpage_handle(url, video_id)
4472 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
4473 redirect_url = urlh.geturl() + new_location
4474 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
4476 title = self._html_search_regex(r'<title>(.*)</title>',
4477 webpage, u'title').split('/')[0].strip()
# POST to the magare.do endpoint returns '<url>=...&<thumb>=...' style pairs.
4480 info_url = "http://vbox7.com/play/magare.do"
4481 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
4482 info_request = compat_urllib_request.Request(info_url, data)
4483 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
4484 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
4485 if info_response is None:
4486 raise ExtractorError(u'Unable to extract the media url')
# Each '&'-separated field is 'name=value'; keep only the values.
4487 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
4494 'thumbnail': thumbnail_url,
4497 class GametrailersIE(InfoExtractor):
4498 _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
4500 def _real_extract(self, url):
4501 mobj = re.match(self._VALID_URL, url)
4503 raise ExtractorError(u'Invalid URL: %s' % url)
4504 video_id = mobj.group('id')
4505 video_type = mobj.group('type')
4506 webpage = self._download_webpage(url, video_id)
4507 if video_type == 'full-episodes':
4508 mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
4510 mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
4511 mgid = self._search_regex(mgid_re, webpage, u'mgid')
4512 data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
4514 info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
4515 video_id, u'Downloading video info')
4516 links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
4517 video_id, u'Downloading video urls info')
4519 self.report_extraction(video_id)
4520 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
4521 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
4523 <url>(?P<thumb>.*?)</url>.*
4526 m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
4528 raise ExtractorError(u'Unable to extract video info')
4529 video_title = m_info.group('title')
4530 video_description = m_info.group('description')
4531 video_thumb = m_info.group('thumb')
4533 m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
4534 if m_urls is None or len(m_urls) == 0:
4535 raise ExtractError(u'Unable to extrat video url')
4536 # They are sorted from worst to best quality
4537 video_url = m_urls[-1].group('url')
4539 return {'url': video_url,
4541 'title': video_title,
4542 # Videos are actually flv not mp4
4544 'thumbnail': video_thumb,
4545 'description': video_description,
4548 def gen_extractors():
4549 """ Return a list of an instance of every supported extractor.
4550 The order does matter; the first extractor matched is the one handling the URL.
# The full extractor list is elided in this excerpt; only a few entries are visible.
4553 YoutubePlaylistIE(),
4578 StanfordOpenClassroomIE(),
4588 WorldStarHipHopIE(),
4617 def get_info_extractor(ie_name):
4618 """Returns the info extractor class with the given ie_name"""
4619 return globals()[ie_name+'IE']