2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information about the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title, author and
32 others. The information is stored in a dictionary which is then
33 passed to the FileDownloader. The FileDownloader processes this
34 information possibly downloading the video to the file system, among
35 other possible outcomes.
37 The dictionaries must include the following fields:
41 title: Video title, unescaped.
42 ext: Video filename extension.
44 The following fields are optional:
46 format: The video format, defaults to ext (used for --get-format)
47 thumbnail: Full URL to a video thumbnail image.
48 description: One-line video description.
49 uploader: Full name of the video uploader.
50 upload_date: Video upload date (YYYYMMDD).
51 uploader_id: Nickname or id of the video uploader.
52 location: Physical location of the video.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The subtitle file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): this excerpt is elided; the original may initialize
        # additional state here (e.g. a "ready" flag) — confirm against full file.
        self.set_downloader(downloader)
81 def suitable(cls, url):
82 """Receives a URL and returns True if suitable for this IE."""
83 return re.match(cls._VALID_URL, url) is not None
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Delegates to the subclass-provided _real_extract().
        # NOTE(review): an initialization call appears to be elided from this excerpt.
        return self._real_extract(url)
101 def set_downloader(self, downloader):
102 """Sets the downloader for this IE."""
103 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Subclass hook; concrete IEs override this (see e.g. YoutubeIE below).
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Subclass hook; must return a list of info dictionaries (see class docstring).
115 return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # (excerpt: some original lines — the note/None branch and the try: —
        # are elided from this view)
        self.report_download_webpage(video_id)
        elif note is not False:
            # Custom progress note supplied by the caller.
            self.to_screen(u'%s: %s' % (video_id, note))
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        # (excerpt: the charset-fallback and try/except around get_full_url()
        # are partially elided from this view)
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the declared charset out of the Content-Type header, if any.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # Request objects expose get_full_url(); plain URL strings do not.
            url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen(u'Dumping request to ' + url)
            # base64 so binary-ish pages survive the terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps decoding from raising on malformed byte sequences.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
155 def to_screen(self, msg):
156 """Print msg to screen, prefixing it with '[ie_name]'"""
157 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
159 def report_extraction(self, id_or_name):
160 """Report information extraction."""
161 self.to_screen(u'%s: Extracting information' % id_or_name)
163 def report_download_webpage(self, video_id):
164 """Report webpage download."""
165 self.to_screen(u'%s: Downloading webpage' % video_id)
167 def report_age_confirmation(self):
168 """Report attempt to confirm age."""
169 self.to_screen(u'Confirming age')
    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        # Tags the info dict so the processing chain treats it as a single video.
        # NOTE(review): the return statement is not visible in this excerpt.
        video_info['_type'] = 'video'
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        # (excerpt: the remainder of the dict literal and the return are elided)
        video_info = {'_type': 'url',
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        # (excerpt: parts of the dict literal / conditionals and the return are elided)
        video_info = {'_type': 'playlist',
            # id/title are only attached when the caller supplied them.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or an
        ExtractorError, depending on fatal, specifying the field name.
        """
        # (excerpt: the list-of-patterns loop and some branches are elided)
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
            mobj = re.search(p, string, flags)
        # Highlight the field name in blue on capable terminals (not Windows).
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
        raise ExtractorError(u'Unable to extract %s' % _name)
        self._downloader.report_warning(u'unable to extract %s; '
            u'please report this issue on GitHub.' % _name)
    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        # (excerpt: the None-result branch is elided from this view)
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        return clean_html(res).strip()
235 class SearchInfoExtractor(InfoExtractor):
237 Base class for paged search queries extractors.
238 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
239 Instances should define _SEARCH_KEY and _MAX_RESULTS.
243 def _make_valid_url(cls):
244 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
247 def suitable(cls, url):
248 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        """Dispatch a search query to _get_n_results based on its prefix."""
        # (excerpt: some branches — e.g. the empty-prefix test and the n parse —
        # are elided from this view)
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError(u'Invalid search query "%s"' % query)
        prefix = mobj.group('prefix')
        query = mobj.group('query')
            # Empty prefix: single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            # Clamp the request to this IE's maximum.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
270 def _get_n_results(self, query, n):
271 """Get a specified number of results for a query"""
272 raise NotImplementedError("This method must be implemented by sublclasses")
275 class YoutubeIE(InfoExtractor):
276 """Information extractor for youtube.com."""
280 (?:https?://)? # http(s):// (optional)
281 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
282 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
283 (?:.*?\#/)? # handle anchor (#/) redirect urls
284 (?: # the various things that can precede the ID:
285 (?:(?:v|embed|e)/) # v/ or embed/ or e/
286 |(?: # or the v= param in all its forms
287 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
288 (?:\?|\#!?) # the params delimiter ? or # or #!
289 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
292 )? # optional -> youtube.com/xxxx is OK
293 )? # all until now is optional -> you can pass the naked ID
294 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
295 (?(1).+)? # if we found the ID, everything can follow
297 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
298 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
299 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
300 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
301 _NETRC_MACHINE = 'youtube'
302 # Listed in order of quality
303 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
304 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
305 _video_extensions = {
311 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
317 _video_dimensions = {
336 def suitable(cls, url):
337 """Receives a URL and returns True if suitable for this IE."""
338 if YoutubePlaylistIE.suitable(url): return False
339 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
341 def report_lang(self):
342 """Report attempt to set language."""
343 self.to_screen(u'Setting language')
345 def report_login(self):
346 """Report attempt to log in."""
347 self.to_screen(u'Logging in')
349 def report_video_webpage_download(self, video_id):
350 """Report attempt to download video webpage."""
351 self.to_screen(u'%s: Downloading video webpage' % video_id)
353 def report_video_info_webpage_download(self, video_id):
354 """Report attempt to download video info webpage."""
355 self.to_screen(u'%s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report that available subtitles are being checked."""
        # (docstring corrected: original was copy-pasted from the
        # video-info-webpage reporter)
        self.to_screen(u'%s: Checking available subtitles' % video_id)
    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of one subtitle track (language + format)."""
        # (docstring corrected: original was copy-pasted from the
        # video-info-webpage reporter)
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
365 def report_video_subtitles_available(self, video_id, sub_lang_list):
366 """Report available subtitles."""
367 sub_lang = ",".join(list(sub_lang_list.keys()))
368 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
370 def report_information_extraction(self, video_id):
371 """Report attempt to extract video information."""
372 self.to_screen(u'%s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        # (docstring corrected: original said "Report extracted video URL")
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
378 def report_rtmp_download(self):
379 """Indicate the download will use the RTMP protocol."""
380 self.to_screen(u'RTMP download detected')
    def _get_available_subtitles(self, video_id):
        """Fetch the subtitle language list for *video_id*.

        On error, returns a (error_message, None) tuple instead of a dict.
        (excerpt: the try: and the final return of the dict are elided)
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Map lang_code -> track name from the XML track list.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
395 def _list_available_subtitles(self, video_id):
396 sub_lang_list = self._get_available_subtitles(video_id)
397 self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Download one subtitle track. Returns a tuple:
        (error_message, sub_lang, sub)
        """
        # (excerpt: the urlencode() argument dict and the try: are elided)
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
            # Empty response body: no subtitle data came back.
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)
    def _request_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        # (excerpt: the urlencode() dict and surrounding try/except are elided)
        sub_lang = self._downloader.params.get('subtitleslang')
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption URL lives inside the embedded player config JSON.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
            return [(err_msg, None, None)]
        player_config = json.loads(mobj.group(1))
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            params = compat_urllib_parse.urlencode({
            subtitles_url = caption_url + '&' + params
            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
            return [(None, sub_lang, sub)]
            # Any failure above falls back to the generic error tuple.
            return [(err_msg, None, None)]
    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        # (excerpt: the 'en' branch body and the final return are elided)
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            # Explicit language requested by the user.
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            # Otherwise fall back to the first available language.
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track for *video_id*.

        (excerpt: the subtitles-list initialization and final return are elided)
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
    def _print_formats(self, formats):
        """List each format with its container extension and dimensions."""
        # (excerpt: the loop header over *formats* binding x is elided)
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set language, log in (credentials or .netrc) and confirm age.

        (excerpt: several try:/if: lines and parts of the login form dict are
        elided from this view)
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best-effort: a broken .netrc only warns, it does not abort.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Force the YouTube UI to English so the scraping regexps match.
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the hidden GALX / dsh tokens the Google login form requires.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age (age-gated videos); failure here is fatal.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Extract the bare video id from any supported YouTube URL form.

        (excerpt: the None check and the return are elided from this view)
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Group 2 of _VALID_URL is the [0-9A-Za-z_-]+ id capture.
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Extract info dict(s) for a single YouTube watch URL.

        (excerpt: numerous try:/if:/else: lines, the url_map initialization,
        and the results-list construction are elided from this view)
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Un-escape the JS-escaped URL (\\/ -> /).
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id: optional; scraped from the webpage, warn on failure
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        video_uploader_id = mobj.group(1)
        self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then parse with unified_strdate
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

        # description: page element first, <meta> tag as fallback
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            video_description = unescapeHTML(fd_mobj.group(1))
            video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            (sub_error, sub_lang, sub) = video_subtitles[0]
            # We try with the automatic captions
            video_subtitles = self._request_automatic_caption(video_id, video_webpage)
            (sub_error_auto, sub_lang, sub) = video_subtitles[0]
            # We report the original error
            self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> signed URL map from the stream-map query string.
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
                    if not 'ratebypass' in url: url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                self._video_dimensions.get(format_param, '???'))

            'url': video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
797 class MetacafeIE(InfoExtractor):
798 """Information Extractor for metacafe.com."""
800 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
801 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
802 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
803 IE_NAME = u'metacafe'
805 def report_disclaimer(self):
806 """Report disclaimer retrieval."""
807 self.to_screen(u'Retrieving disclaimer')
    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and confirm age.

        (excerpt: try: lines and part of the disclaimer form dict are elided)
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age by POSTing the filter form.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Extract the video info dict for a metacafe.com watch URL.

        (excerpt: several if:/else: lines and the final return-list wrapper
        are elided from this view)
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # 'yt-<id>' items are delegated to the YouTube extractor.
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback path: parse the flashvars blob for mediaData.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            raise ExtractorError(u'Unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = mobj.group('mediaURL').replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): .decode() on these values is Python-2-only (str -> unicode).
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
893 class DailymotionIE(InfoExtractor):
894 """Information Extractor for Dailymotion"""
896 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
897 IE_NAME = u'dailymotion'
    def _real_extract(self, url):
        """Extract the video info dict for a Dailymotion video URL.

        (excerpt: several if:/else: lines, the quality-selection loop body and
        the final return-list wrapper are elided from this view)
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Strip title suffix and query string from the id path segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-gated pages render.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self.to_screen(u'Using %s' % key)
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                             # Looking for official user
                                             r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
                                            webpage, 'video uploader')

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Reassemble DD-MM-YYYY into YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
963 class PhotobucketIE(InfoExtractor):
964 """Information extractor for photobucket.com."""
966 # TODO: the original _VALID_URL was:
967 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
968 # Check if it's necessary to keep the old extracion process
969 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
970 IE_NAME = u'photobucket'
    def _real_extract(self, url):
        """Extract the video info dict for a photobucket.com media URL.

        (excerpt: several if:/return lines are elided from this view)
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        info = json.loads(mobj.group('json'))
            'url': info[u'downloadUrl'],
            'uploader': info[u'username'],
            'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
            'title': info[u'title'],
            'ext': video_extension,
            'thumbnail': info[u'thumbUrl'],

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): .decode() on these values is Python-2-only (str -> unicode).
        video_title = mobj.group(1).decode('utf-8')
        video_uploader = mobj.group(2).decode('utf-8')

        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
1021 class YahooIE(InfoExtractor):
# Extractor for screen.yahoo.com video pages.
# NOTE(review): elided numbered listing — some guard/`if`/`return` lines
# are missing between the visible lines; the control flow shown is partial.
1022 """Information extractor for screen.yahoo.com."""
1023 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1025 def _real_extract(self, url):
1026 mobj = re.match(self._VALID_URL, url)
# (elided) presumably guarded by `if mobj is None:`
1028 raise ExtractorError(u'Invalid URL: %s' % url)
1029 video_id = mobj.group('id')
1030 webpage = self._download_webpage(url, video_id)
# Pages may declare an alternate content id via YUI; its presence selects
# between the two extraction strategies below.
1031 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
# Strategy 1: query the cosmos.bcst.yahoo.com MRSS REST endpoint directly.
1034 # TODO: Check which url parameters are required
1035 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1036 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
# Verbose multi-line regex over the MRSS XML: title, description,
# publication date and large thumbnail.
1037 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1038 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1039 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1040 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1042 self.report_extraction(video_id)
1043 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1045 raise ExtractorError(u'Unable to extract video info')
1046 video_title = m_info.group('title')
1047 video_description = m_info.group('description')
1048 video_thumb = m_info.group('thumb')
1049 video_date = m_info.group('date')
# Normalize MM/DD/YYYY to the YYYYMMDD `upload_date` convention.
1050 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1052 # TODO: Find a way to get mp4 videos
# Second REST call resolves the actual stream host/path (flv only).
1053 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1054 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1055 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1056 video_url = m_rest.group('url')
1057 video_path = m_rest.group('path')
1059 raise ExtractorError(u'Unable to extract video url')
# Strategy 2: a CONTENT_ID was found — use the public YQL JSONP endpoint.
1061 else: # We have to use a different method if another id is defined
1062 long_id = m_id.group('new_id')
1063 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1064 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
# Strip the JSONP callback wrapper before parsing the JSON body.
1065 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1066 info = json.loads(json_str)
1067 res = info[u'query'][u'results'][u'mediaObj'][0]
# First stream entry supplies the rtmp host and play path.
1068 stream = res[u'streams'][0]
1069 video_path = stream[u'path']
1070 video_url = stream[u'host']
# (elided) `meta` is presumably bound from `res` on a missing line.
1072 video_title = meta[u'title']
1073 video_description = meta[u'description']
1074 video_thumb = meta[u'thumbnail']
1075 video_date = None # I can't find it
# Result dict (opening lines elided); `play_path` supports rtmpdump.
1080 'play_path': video_path,
1081 'title':video_title,
1082 'description': video_description,
1083 'thumbnail': video_thumb,
1084 'upload_date': video_date,
1089 class VimeoIE(InfoExtractor):
# Extractor for vimeo.com (including player.vimeo.com and vimeopro).
# NOTE(review): elided numbered listing — `try:`/`except`/`return` lines
# are missing between visible lines; treat visible flow as partial.
1090 """Information extractor for vimeo.com."""
1092 # _VALID_URL matches Vimeo URLs
1093 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1096 def _real_extract(self, url, new_video=True):
1097 # Extract ID from URL
1098 mobj = re.match(self._VALID_URL, url)
1100 raise ExtractorError(u'Invalid URL: %s' % url)
1102 video_id = mobj.group('id')
# Normalize scheme-less URLs and canonicalize pro/player links so the
# config JSON is always served from the main vimeo.com page.
1103 if not mobj.group('proto'):
1104 url = 'https://' + url
1105 if mobj.group('direct_link') or mobj.group('pro'):
1106 url = 'https://vimeo.com/' + video_id
1108 # Retrieve video webpage to extract further information
1109 request = compat_urllib_request.Request(url, None, std_headers)
1110 webpage = self._download_webpage(request, video_id)
1112 # Now we begin extracting as much information as we can from what we
1113 # retrieved. First we extract the information common to all extractors,
1114 # and latter we extract those that are Vimeo specific.
1115 self.report_extraction(video_id)
1117 # Extract the config JSON
# Crude but effective: slice the embedded `{config:...}` object out of
# the page script and parse it as JSON.
1119 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1120 config = json.loads(config)
# (elided) the next two raises are presumably inside an `except` branch
# for the config-parsing block above.
1122 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1123 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1125 raise ExtractorError(u'Unable to extract info section')
1128 video_title = config["video"]["title"]
1130 # Extract uploader and uploader_id
1131 video_uploader = config["video"]["owner"]["name"]
# uploader_id is the last path segment of the owner URL, when present.
1132 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1134 # Extract video thumbnail
1135 video_thumbnail = config["video"]["thumbnail"]
1137 # Extract video description
1138 video_description = get_element_by_attribute("itemprop", "description", webpage)
1139 if video_description: video_description = clean_html(video_description)
1140 else: video_description = u''
1142 # Extract upload date
1143 video_upload_date = None
1144 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1145 if mobj is not None:
1146 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1148 # Vimeo specific: extract request signature and timestamp
# sig + timestamp are required query parameters of play_redirect below.
1149 sig = config['request']['signature']
1150 timestamp = config['request']['timestamp']
1152 # Vimeo specific: extract video codec and quality information
1153 # First consider quality, then codecs, then take everything
1154 # TODO bind to format param
# Codec preference order; each codec maps to its container extension.
1155 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1156 files = { 'hd': [], 'sd': [], 'other': []}
1157 for codec_name, codec_extension in codecs:
1158 if codec_name in config["video"]["files"]:
1159 if 'hd' in config["video"]["files"][codec_name]:
1160 files['hd'].append((codec_name, codec_extension, 'hd'))
1161 elif 'sd' in config["video"]["files"][codec_name]:
1162 files['sd'].append((codec_name, codec_extension, 'sd'))
# Neither hd nor sd: fall back to whatever quality label is listed first.
1164 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first available (codec, ext, quality) in hd > sd > other order.
1166 for quality in ('hd', 'sd', 'other'):
1167 if len(files[quality]) > 0:
1168 video_quality = files[quality][0][2]
1169 video_codec = files[quality][0][0]
1170 video_extension = files[quality][0][1]
1171 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
# (elided) this raise presumably sits in an `else:` after the loop.
1174 raise ExtractorError(u'No known codec found')
1176 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1177 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Result dict (opening lines elided from this listing).
1182 'uploader': video_uploader,
1183 'uploader_id': video_uploader_id,
1184 'upload_date': video_upload_date,
1185 'title': video_title,
1186 'ext': video_extension,
1187 'thumbnail': video_thumbnail,
1188 'description': video_description,
1192 class ArteTvIE(InfoExtractor):
# Extractor for videos.arte.tv (fr/de), with separate handling for the
# live-stream index pages and regular "+7" catch-up streams.
# NOTE(review): elided numbered listing — `try:`/`return`/dict-literal
# lines are missing between visible lines; flow shown is partial.
1193 """arte.tv information extractor."""
1195 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1196 _LIVE_URL = r'index-[0-9]+\.html$'
1198 IE_NAME = u'arte.tv'
1200 def fetch_webpage(self, url):
# Download a page body directly via urllib (bypasses _download_webpage),
# translating network errors into ExtractorError.
1201 request = compat_urllib_request.Request(url)
1203 self.report_download_webpage(url)
1204 webpage = compat_urllib_request.urlopen(request).read()
1205 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1206 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1207 except ValueError as err:
1208 raise ExtractorError(u'Invalid URL: %s' % url)
1211 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch `url`, match `regex` with `regexFlags`, and build an info dict
# from matchTuples = [(group_index, key, error_message), ...]; each
# missing group raises its associated error.
1212 page = self.fetch_webpage(url)
1213 mobj = re.search(regex, page, regexFlags)
1217 raise ExtractorError(u'Invalid URL: %s' % url)
1219 for (i, key, err) in matchTuples:
1220 if mobj.group(i) is None:
1221 raise ExtractorError(err)
# (elided) else-branch: store the captured group under `key`.
1223 info[key] = mobj.group(i)
1227 def extractLiveStream(self, url):
# Live streams: chase the videothek JS file, then pull the rtmp path,
# SWF player and stream URL out of it.
1228 video_lang = url.split('/')[-4]
1229 info = self.grep_webpage(
1231 r'src="(.*?/videothek_js.*?\.js)',
1234 (1, 'url', u'Invalid URL: %s' % url)
1237 http_host = url.split('/')[2]
1238 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1239 info = self.grep_webpage(
1241 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1242 '(http://.*?\.swf).*?' +
1246 (1, 'path', u'could not extract video path: %s' % url),
1247 (2, 'player', u'could not extract video player: %s' % url),
1248 (3, 'url', u'could not extract video url: %s' % url)
1251 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1253 def extractPlus7Stream(self, url):
# Catch-up ("+7") streams: three chained page fetches — movie param,
# language-specific <video> ref, then the final metadata/url XML.
1254 video_lang = url.split('/')[-3]
1255 info = self.grep_webpage(
1257 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1260 (1, 'url', u'Invalid URL: %s' % url)
1263 next_url = compat_urllib_parse.unquote(info.get('url'))
1264 info = self.grep_webpage(
1266 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1269 (1, 'url', u'Could not find <video> tag: %s' % url)
1272 next_url = compat_urllib_parse.unquote(info.get('url'))
1274 info = self.grep_webpage(
# Final XML: id, display name, date, and hd-quality URL.
1276 r'<video id="(.*?)".*?>.*?' +
1277 '<name>(.*?)</name>.*?' +
1278 '<dateVideo>(.*?)</dateVideo>.*?' +
1279 '<url quality="hd">(.*?)</url>',
1282 (1, 'id', u'could not extract video id: %s' % url),
1283 (2, 'title', u'could not extract video title: %s' % url),
1284 (3, 'date', u'could not extract video date: %s' % url),
1285 (4, 'url', u'could not extract video url: %s' % url)
# Result dict (opening line elided from this listing).
1290 'id': info.get('id'),
1291 'url': compat_urllib_parse.unquote(info.get('url')),
1292 'uploader': u'arte.tv',
1293 'upload_date': unified_strdate(info.get('date')),
1294 'title': info.get('title').decode('utf-8'),
1300 def _real_extract(self, url):
# Route live index pages to extractLiveStream, everything else to
# extractPlus7Stream.
1301 video_id = url.split('/')[-1]
1302 self.report_extraction(video_id)
1304 if re.search(self._LIVE_URL, video_id) is not None:
1305 self.extractLiveStream(url)
1308 info = self.extractPlus7Stream(url)
1313 class GenericIE(InfoExtractor):
# Last-resort extractor: follows redirects (URL shorteners), then tries a
# cascade of heuristic regexes for common embedded-player patterns.
# NOTE(review): elided numbered listing — several `if mobj is None:` and
# `return` lines are missing between visible lines.
1314 """Generic last-resort information extractor."""
1317 IE_NAME = u'generic'
1319 def report_download_webpage(self, video_id):
1320 """Report webpage download."""
# Warn about the generic fallback, except when running the test suite.
1321 if not self._downloader.params.get('test', False):
1322 self._downloader.report_warning(u'Falling back on generic information extractor.')
1323 super(GenericIE, self).report_download_webpage(video_id)
1325 def report_following_redirect(self, new_url):
1326 """Report information extraction."""
1327 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1329 def _test_redirect(self, url):
1330 """Check if it is a redirect, like url shorteners, in case return the new url."""
# HEAD-based probe: cheaper than GET for discovering the final URL.
1331 class HeadRequest(compat_urllib_request.Request):
1332 def get_method(self):
1335 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1337 Subclass the HTTPRedirectHandler to make it use our
1338 HeadRequest also on the redirected URL
1340 def redirect_request(self, req, fp, code, msg, headers, newurl):
1341 if code in (301, 302, 303, 307):
1342 newurl = newurl.replace(' ', '%20')
# Drop entity headers that no longer apply to the redirected HEAD.
1343 newheaders = dict((k,v) for k,v in req.headers.items()
1344 if k.lower() not in ("content-length", "content-type"))
1345 return HeadRequest(newurl,
1347 origin_req_host=req.get_origin_req_host(),
1350 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1352 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1354 Fallback to GET if HEAD is not allowed (405 HTTP error)
1356 def http_error_405(self, req, fp, code, msg, headers):
1360 newheaders = dict((k,v) for k,v in req.headers.items()
1361 if k.lower() not in ("content-length", "content-type"))
1362 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1364 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with just the handlers needed for the probe.
1368 opener = compat_urllib_request.OpenerDirector()
1369 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1370 HTTPMethodFallback, HEADRedirectHandler,
1371 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1372 opener.add_handler(handler())
1374 response = opener.open(HeadRequest(url))
1375 if response is None:
1376 raise ExtractorError(u'Invalid URL protocol')
1377 new_url = response.geturl()
# (elided) presumably returns False when new_url == url.
1382 self.report_following_redirect(new_url)
1385 def _real_extract(self, url):
1386 new_url = self._test_redirect(url)
# Delegate redirected URLs back to the full extractor list.
1387 if new_url: return [self.url_result(new_url)]
1389 video_id = url.split('/')[-1]
1391 webpage = self._download_webpage(url, video_id)
1392 except ValueError as err:
1393 # since this is the last-resort InfoExtractor, if
1394 # this error is thrown, it'll be thrown here
1395 raise ExtractorError(u'Invalid URL: %s' % url)
1397 self.report_extraction(video_id)
# Heuristic cascade; each fallback fires only when the previous regex
# found nothing (the `if mobj is None:` guards are elided here).
1398 # Start with something easy: JW Player in SWFObject
1399 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1401 # Broaden the search a little bit
1402 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1404 # Broaden the search a little bit: JWPlayer JS loader
1405 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1407 # Try to find twitter cards info
1408 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1410 raise ExtractorError(u'Invalid URL: %s' % url)
1412 # It's possible that one of the regexes
1413 # matched, but returned an empty group:
1414 if mobj.group(1) is None:
1415 raise ExtractorError(u'Invalid URL: %s' % url)
1417 video_url = compat_urllib_parse.unquote(mobj.group(1))
1418 video_id = os.path.basename(video_url)
1420 # here's a fun little line of code for you:
1421 video_extension = os.path.splitext(video_id)[1][1:]
1422 video_id = os.path.splitext(video_id)[0]
1424 # it's tempting to parse this further, but you would
1425 # have to take into account all the variations like
1426 # Video Title - Site Name
1427 # Site Name | Video Title
1428 # Video Title - Tagline | Site Name
1429 # and so on and so forth; it's just not practical
1430 video_title = self._html_search_regex(r'<title>(.*)</title>',
1431 webpage, u'video title')
1433 # video uploader is domain name
1434 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1435 url, u'video uploader')
# Result dict (opening lines elided from this listing).
1440 'uploader': video_uploader,
1441 'upload_date': None,
1442 'title': video_title,
1443 'ext': video_extension,
1447 class YoutubeSearchIE(SearchInfoExtractor):
# `ytsearch` keyword: pages through the GData JSON-C API 50 results at a
# time and returns the collected watch URLs as a playlist.
# NOTE(review): elided numbered listing — loop-setup lines (initializing
# video_ids/pagenum/limit) are missing between visible lines.
1448 """Information Extractor for YouTube search queries."""
1449 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1451 IE_NAME = u'youtube:search'
1452 _SEARCH_KEY = 'ytsearch'
1454 def report_download_page(self, query, pagenum):
1455 """Report attempt to download search page with given number."""
1456 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1458 def _get_n_results(self, query, n):
1459 """Get a specified number of results for a query"""
# Loop until we have collected `limit` ids (50 per API page).
1465 while (50 * pagenum) < limit:
1466 self.report_download_page(query, pagenum+1)
# GData start-index is 1-based, hence the +1.
1467 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1468 request = compat_urllib_request.Request(result_url)
1470 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1471 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1472 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1473 api_response = json.loads(data)['data']
1475 if not 'items' in api_response:
1476 raise ExtractorError(u'[youtube] No video results')
1478 new_ids = list(video['id'] for video in api_response['items'])
1479 video_ids += new_ids
# Cap the target at whatever the API says actually exists.
1481 limit = min(n, api_response['totalItems'])
1484 if len(video_ids) > n:
1485 video_ids = video_ids[:n]
1486 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1487 return self.playlist_result(videos, query)
1490 class GoogleSearchIE(SearchInfoExtractor):
# `gvsearch` keyword: scrapes Google Video search result pages (10 hits
# per page) until `n` entries are collected or no next-page link remains.
# NOTE(review): elided numbered listing — the `res` dict initialization
# and return statement are partially missing from this view.
1491 """Information Extractor for Google Video search queries."""
1492 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1494 IE_NAME = u'video.google:search'
1495 _SEARCH_KEY = 'gvsearch'
1497 def _get_n_results(self, query, n):
1498 """Get a specified number of results for a query"""
# Accumulate url_results into a playlist-shaped dict.
1501 '_type': 'playlist',
1506 for pagenum in itertools.count(1):
1507 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1508 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1509 note='Downloading result page ' + str(pagenum))
# Each organic result link lives in an <h3 class="r"> anchor.
1511 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1514 'url': mobj.group(1)
1516 res['entries'].append(e)
# Stop when enough results are gathered or the "next" control is absent.
1518 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1521 class YahooSearchIE(SearchInfoExtractor):
# `yvsearch` keyword: queries Yahoo video search's JSON endpoint 30 hits
# per page and turns each hit into a url_result for YahooIE.
# NOTE(review): elided numbered listing — the `res` dict initialization,
# the binding of `m` (pagination metadata), and the return are missing.
1522 """Information Extractor for Yahoo! Video search queries."""
1525 IE_NAME = u'screen.yahoo:search'
1526 _SEARCH_KEY = 'yvsearch'
1528 def _get_n_results(self, query, n):
1529 """Get a specified number of results for a query"""
1532 '_type': 'playlist',
1536 for pagenum in itertools.count(0):
1537 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1538 webpage = self._download_webpage(result_url, query,
1539 note='Downloading results page '+str(pagenum+1))
# Response body is JSON, not HTML.
1540 info = json.loads(webpage)
1542 results = info[u'results']
1544 for (i, r) in enumerate(results):
# Stop mid-page once `n` total results have been emitted.
1545 if (pagenum * 30) +i >= n:
1547 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1548 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1549 res['entries'].append(e)
# `m` presumably holds pagination metadata from `info` — its binding is
# on an elided line; `last`/`total` mark the final page.
1550 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1556 class YoutubePlaylistIE(InfoExtractor):
# Extractor for YouTube playlists via the GData JSON API, paging with
# max-results/start-index until a short page signals the end.
# NOTE(review): elided numbered listing — the verbose-regex body of
# _VALID_URL, the page loop opener, and _MAX_RESULTS are partially missing.
1557 """Information Extractor for YouTube playlists."""
1559 _VALID_URL = r"""(?:
1564 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1565 \? (?:.*?&)*? (?:p|a|list)=
1568 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1571 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1573 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1575 IE_NAME = u'youtube:playlist'
1578 def suitable(cls, url):
1579 """Receives a URL and returns True if suitable for this IE."""
# _VALID_URL is a verbose-mode pattern, so re.VERBOSE is mandatory here.
1580 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1582 def _real_extract(self, url):
1583 # Extract playlist id
1584 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1586 raise ExtractorError(u'Invalid URL: %s' % url)
1588 # Download playlist videos from API
# Either alternative capture group may carry the playlist id.
1589 playlist_id = mobj.group(1) or mobj.group(2)
1594 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1595 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1598 response = json.loads(page)
1599 except ValueError as err:
1600 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1602 if 'feed' not in response:
1603 raise ExtractorError(u'Got a malformed response from YouTube API')
1604 playlist_title = response['feed']['title']['$t']
1605 if 'entry' not in response['feed']:
1606 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, watch-url) pairs so results can be order-sorted.
1609 for entry in response['feed']['entry']:
1610 index = entry['yt$position']['$t']
1611 if 'media$group' in entry and 'media$player' in entry['media$group']:
1612 videos.append((index, entry['media$group']['media$player']['url']))
# Short page => last page of the playlist.
1614 if len(response['feed']['entry']) < self._MAX_RESULTS:
1618 videos = [v[1] for v in sorted(videos)]
1620 url_results = [self.url_result(url, 'Youtube') for url in videos]
1621 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1624 class YoutubeChannelIE(InfoExtractor):
# Extractor for YouTube channels: scrapes the first HTML page, then pages
# through the channel_ajax JSON endpoint while a "load more" marker exists.
# NOTE(review): elided numbered listing — pagenum/video_ids initialization
# and loop/break lines are missing between visible lines.
1625 """Information Extractor for YouTube channels."""
1627 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1628 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1629 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1630 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1631 IE_NAME = u'youtube:channel'
1633 def extract_videos_from_page(self, page):
# Collect unique 11-char-style video ids from watch?v= hrefs, preserving
# first-seen order.
1635 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1636 if mobj.group(1) not in ids_in_page:
1637 ids_in_page.append(mobj.group(1))
1640 def _real_extract(self, url):
1641 # Extract channel id
1642 mobj = re.match(self._VALID_URL, url)
1644 raise ExtractorError(u'Invalid URL: %s' % url)
1646 # Download channel page
1647 channel_id = mobj.group(1)
1651 url = self._TEMPLATE_URL % (channel_id, pagenum)
1652 page = self._download_webpage(url, channel_id,
1653 u'Downloading page #%s' % pagenum)
1655 # Extract video identifiers
1656 ids_in_page = self.extract_videos_from_page(page)
1657 video_ids.extend(ids_in_page)
1659 # Download any subsequent channel pages using the json-based channel_ajax query
1660 if self._MORE_PAGES_INDICATOR in page:
1662 pagenum = pagenum + 1
1664 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1665 page = self._download_webpage(url, channel_id,
1666 u'Downloading page #%s' % pagenum)
# Ajax responses are JSON; the HTML fragment lives in 'content_html'.
1668 page = json.loads(page)
1670 ids_in_page = self.extract_videos_from_page(page['content_html'])
1671 video_ids.extend(ids_in_page)
# Stop when the widget HTML no longer offers a "load more" control.
1673 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1676 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1678 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1679 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1680 return [self.playlist_result(url_entries, channel_id)]
1683 class YoutubeUserIE(InfoExtractor):
# Extractor for YouTube user upload feeds via the GData API, paging
# _GDATA_PAGE_SIZE ids at a time until a short page is seen.
# NOTE(review): elided numbered listing — pagenum/video_ids/ids_in_page
# initialization and loop/break lines are missing between visible lines.
1684 """Information Extractor for YouTube users."""
1686 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1687 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1688 _GDATA_PAGE_SIZE = 50
1689 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1690 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1691 IE_NAME = u'youtube:user'
1693 def _real_extract(self, url):
1695 mobj = re.match(self._VALID_URL, url)
1697 raise ExtractorError(u'Invalid URL: %s' % url)
1699 username = mobj.group(1)
1701 # Download video ids using YouTube Data API. Result size per
1702 # query is limited (currently to 50 videos) so we need to query
1703 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1710 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1712 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1713 page = self._download_webpage(gdata_url, username,
1714 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1716 # Extract video identifiers
# De-duplicate ids within a page while preserving order.
1719 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1720 if mobj.group(1) not in ids_in_page:
1721 ids_in_page.append(mobj.group(1))
1723 video_ids.extend(ids_in_page)
1725 # A little optimization - if current page is not
1726 # "full", ie. does not contain PAGE_SIZE video ids then
1727 # we can assume that this page is the last one - there
1728 # are no more ids on further pages - no need to query
1731 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1736 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1737 url_results = [self.url_result(url, 'Youtube') for url in urls]
1738 return [self.playlist_result(url_results, playlist_title = username)]
1741 class BlipTVUserIE(InfoExtractor):
# Extractor for blip.tv user pages: resolves the numeric users_id from
# the mobile page, then pages the episode-list Ajax endpoint.
# NOTE(review): elided numbered listing — pagenum/video_ids/ids_in_page
# initialization, _PAGE_SIZE, and loop/break lines are missing here.
1742 """Information Extractor for blip.tv users."""
1744 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1746 IE_NAME = u'blip.tv:user'
1748 def _real_extract(self, url):
1750 mobj = re.match(self._VALID_URL, url)
1752 raise ExtractorError(u'Invalid URL: %s' % url)
1754 username = mobj.group(1)
1756 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1758 page = self._download_webpage(url, username, u'Downloading user page')
# The numeric account id is embedded as a data attribute on the page.
1759 mobj = re.search(r'data-users-id="([^"]+)"', page)
1760 page_base = page_base % mobj.group(1)
1763 # Download video ids using BlipTV Ajax calls. Result size per
1764 # query is limited (currently to 12 videos) so we need to query
1765 # page by page until there are no video ids - it means we got
1772 url = page_base + "&page=" + str(pagenum)
1773 page = self._download_webpage(url, username,
1774 u'Downloading video ids from page %d' % pagenum)
1776 # Extract video identifiers
# Every same-site href is treated as a candidate episode slug; duplicates
# are skipped and HTML entities unescaped.
1779 for mobj in re.finditer(r'href="/([^"]+)"', page):
1780 if mobj.group(1) not in ids_in_page:
1781 ids_in_page.append(unescapeHTML(mobj.group(1)))
1783 video_ids.extend(ids_in_page)
1785 # A little optimization - if current page is not
1786 # "full", ie. does not contain PAGE_SIZE video ids then
1787 # we can assume that this page is the last one - there
1788 # are no more ids on further pages - no need to query
1791 if len(ids_in_page) < self._PAGE_SIZE:
1796 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1797 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1798 return [self.playlist_result(url_entries, playlist_title = username)]
1801 class DepositFilesIE(InfoExtractor):
# Extractor for depositfiles.com download pages: simulates pressing the
# "Free download" button and scrapes the resulting form action URL.
# NOTE(review): elided numbered listing — `try:` openers and the result
# dict's opening lines are missing between visible lines.
1802 """Information extractor for depositfiles.com"""
1804 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1806 def _real_extract(self, url):
1807 file_id = url.split('/')[-1]
1808 # Rebuild url in english locale
# Forces the /en/ locale so the scraping regexes match English markup.
1809 url = 'http://depositfiles.com/en/files/' + file_id
1811 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 mimics the free-download form submission.
1812 free_download_indication = { 'gateway_result' : '1' }
1813 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1815 self.report_download_webpage(file_id)
1816 webpage = compat_urllib_request.urlopen(request).read()
1817 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1818 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1820 # Search for the real file URL
1821 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1822 if (mobj is None) or (mobj.group(1) is None):
1823 # Try to figure out reason of the error.
# Surface the site's own restriction message (e.g. rate limits) when the
# download link is absent.
1824 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1825 if (mobj is not None) and (mobj.group(1) is not None):
1826 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1827 raise ExtractorError(u'%s' % restriction_message)
1829 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1831 file_url = mobj.group(1)
1832 file_extension = os.path.splitext(file_url)[1][1:]
1834 # Search for file title
1835 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# Result dict; .decode('utf-8') implies Python 2 byte strings here.
1838 'id': file_id.decode('utf-8'),
1839 'url': file_url.decode('utf-8'),
1841 'upload_date': None,
1842 'title': file_title,
1843 'ext': file_extension.decode('utf-8'),
1847 class FacebookIE(InfoExtractor):
# Extractor for Facebook videos; supports optional login via downloader
# params or .netrc before extraction.
# NOTE(review): elided numbered listing — the login form construction,
# `try:` openers, and parts of the result dict are missing from view.
1848 """Information Extractor for Facebook"""
1850 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1851 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1852 _NETRC_MACHINE = 'facebook'
1853 IE_NAME = u'facebook'
1855 def report_login(self):
1856 """Report attempt to log in."""
1857 self.to_screen(u'Logging in')
1859 def _real_initialize(self):
# Best-effort login: failures only emit warnings, extraction proceeds.
1860 if self._downloader is None:
1865 downloader_params = self._downloader.params
1867 # Attempt to use provided username and password or .netrc data
1868 if downloader_params.get('username', None) is not None:
1869 useremail = downloader_params['username']
1870 password = downloader_params['password']
1871 elif downloader_params.get('usenetrc', False):
1873 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1874 if info is not None:
# (elided) else-branch raise for missing .netrc entry
1878 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1879 except (IOError, netrc.NetrcParseError) as err:
1880 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1883 if useremail is None:
1892 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1895 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means the login failed.
1896 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1897 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1899 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1900 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1903 def _real_extract(self, url):
1904 mobj = re.match(self._VALID_URL, url)
1906 raise ExtractorError(u'Invalid URL: %s' % url)
1907 video_id = mobj.group('ID')
1909 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1910 webpage = self._download_webpage(url, video_id)
# The player parameters are JSON wedged between two known JS fragments.
1912 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1913 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1914 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1916 raise ExtractorError(u'Cannot parse data')
1917 data = dict(json.loads(m.group(1)))
1918 params_raw = compat_urllib_parse.unquote(data['params'])
1919 params = json.loads(params_raw)
1920 video_data = params['video_data'][0]
# Prefer the HD source; fall back to SD (guard lines elided).
1921 video_url = video_data.get('hd_src')
1923 video_url = video_data['sd_src']
1925 raise ExtractorError(u'Cannot find video URL')
1926 video_duration = int(video_data['video_duration'])
1927 thumbnail = video_data['thumbnail_src']
1929 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
# Result dict (opening lines elided from this listing).
1934 'title': video_title,
1937 'duration': video_duration,
1938 'thumbnail': thumbnail,
# NOTE(review): interior lines of this class are elided in this excerpt
# (try/except openers, "if mobj is None:" guards, dict openers are missing).
1943 class BlipTVIE(InfoExtractor):
1944 """Information extractor for blip.tv"""
1946 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Regex used to pull the filename extension out of a media URL.
1947 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1948 IE_NAME = u'blip.tv'
1950 def report_direct_download(self, title):
1951 """Report information extraction."""
1952 self.to_screen(u'%s: Direct download detected' % title)
1954 def _real_extract(self, url):
1955 mobj = re.match(self._VALID_URL, url)
1957 raise ExtractorError(u'Invalid URL: %s' % url)
1959 # See https://github.com/rg3/youtube-dl/issues/857
# api.swf fragment URLs are rewritten to the equivalent /play/ URL first.
1960 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1961 if api_mobj is not None:
1962 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1963 urlp = compat_urllib_parse_urlparse(url)
# /play/ URLs redirect; the real file id is recovered from the redirect's
# fragment query string, then extraction restarts recursively on the a- URL.
1964 if urlp.path.startswith('/play/'):
1965 request = compat_urllib_request.Request(url)
1966 response = compat_urllib_request.urlopen(request)
1967 redirecturl = response.geturl()
1968 rurlp = compat_urllib_parse_urlparse(redirecturl)
1969 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1970 url = 'http://blip.tv/a/a-' + file_id
1971 return self._real_extract(url)
# Ask blip.tv for a JSON rendition of the page metadata.
1978 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1979 request = compat_urllib_request.Request(json_url)
# blip.tv serves different content per user agent; iTunes UA gets clean data.
1980 request.add_header('User-Agent', 'iTunes/10.6.1')
1981 self.report_extraction(mobj.group(1))
1984 urlh = compat_urllib_request.urlopen(request)
# If the server returns the media itself rather than JSON, synthesize
# the info dict straight from the URL's basename.
1985 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1986 basename = url.split('/')[-1]
1987 title,ext = os.path.splitext(basename)
1988 title = title.decode('UTF-8')
1989 ext = ext.replace('.', '')
1990 self.report_direct_download(title)
1995 'upload_date': None,
2000 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2001 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
# Regular (non-direct) case: parse the JSON metadata document.
2002 if info is None: # Regular URL
2004 json_code_bytes = urlh.read()
2005 json_code = json_code_bytes.decode('utf-8')
2006 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2007 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2010 json_data = json.loads(json_code)
# Some responses wrap the payload under a 'Post' key.
2011 if 'Post' in json_data:
2012 data = json_data['Post']
# blip.tv dates look like "m-d-y H:M(am|pm)"; normalize to YYYYMMDD.
2016 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2017 video_url = data['media']['url']
2018 umobj = re.match(self._URL_EXT, video_url)
2020 raise ValueError('Can not determine filename extension')
2021 ext = umobj.group(1)
2024 'id': data['item_id'],
2026 'uploader': data['display_name'],
2027 'upload_date': upload_date,
2028 'title': data['title'],
2030 'format': data['media']['mimeType'],
2031 'thumbnail': data['thumbnailUrl'],
2032 'description': data['description'],
2033 'player_url': data['embedUrl'],
# Downloader must reuse the same UA or the media request may be refused.
2034 'user_agent': 'iTunes/10.6.1',
2036 except (ValueError,KeyError) as err:
2037 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): interior lines of this class are elided in this excerpt
# (loop headers, guards and several assignments are missing).
2042 class MyVideoIE(InfoExtractor):
2043 """Information Extractor for myvideo.de."""
2045 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2046 IE_NAME = u'myvideo'
2048 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2049 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2050 # https://github.com/rg3/youtube-dl/pull/842
# RC4 stream cipher (KSA below, PRGA elided) used to decrypt the site's
# encrypted player XML.
2051 def __rc4crypt(self,data, key):
2053 box = list(range(256))
# Key-scheduling algorithm: permute the 256-entry box with the key bytes.
2054 for i in list(range(256)):
2055 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2056 box[i], box[x] = box[x], box[i]
2062 y = (y + box[x]) % 256
2063 box[x], box[y] = box[y], box[x]
# XOR each input byte against the keystream byte.
2064 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# MD5 helper: hex digest of s, re-encoded to bytes for use as an RC4 key part.
2068 return hashlib.md5(s).hexdigest().encode()
2070 def _real_extract(self,url):
2071 mobj = re.match(self._VALID_URL, url)
2073 raise ExtractorError(u'invalid URL: %s' % url)
2075 video_id = mobj.group(1)
# GK: doubly-base64-encoded site key material fed into the decryption key.
2078 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2079 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2080 b'TnpsbA0KTVRkbU1tSTRNdz09'
2084 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2085 webpage = self._download_webpage(webpage_url, video_id)
# Easy path: a plain <source> tag in the page gives a direct FLV URL.
2087 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2088 if mobj is not None:
2089 self.report_extraction(video_id)
2090 video_url = mobj.group(1) + '.flv'
2092 video_title = self._html_search_regex('<title>([^<]+)</title>',
2095 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2101 'upload_date': None,
2102 'title': video_title,
# Hard path: parse the flashvars blob to locate the encrypted XML endpoint.
2107 mobj = re.search('var flashvars={(.+?)}', webpage)
2109 raise ExtractorError(u'Unable to extract video')
2114 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
# '_encxml' holds the (URL-encoded) encrypted-XML URL; other pairs are params.
2115 if not a == '_encxml':
2118 encxml = compat_urllib_parse.unquote(b)
2119 if not params.get('domain'):
2120 params['domain'] = 'www.myvideo.de'
2121 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV player variant is rewritten to the plain player XML endpoint.
2122 if 'flash_playertype=MTV' in xmldata_url:
2123 self._downloader.report_warning(u'avoiding MTV player')
2125 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2126 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# The response is "something=<hex>"; take the hex half and RC4-decrypt it.
2130 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2131 enc_data_b = binascii.unhexlify(enc_data)
2133 base64.b64decode(base64.b64decode(GK)) +
2135 str(video_id).encode('utf-8')
2138 dec_data = self.__rc4crypt(enc_data_b, sk)
2141 self.report_extraction(video_id)
# RTMP case: connection URL is embedded in the decrypted XML.
2144 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2146 video_url = compat_urllib_parse.unquote(mobj.group(1))
2147 if 'myvideo2flash' in video_url:
2148 self._downloader.report_warning(u'forcing RTMPT ...')
2149 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2152 # extract non rtmp videos
2153 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2155 raise ExtractorError(u'unable to extract url')
2156 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2158 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2159 video_file = compat_urllib_parse.unquote(video_file)
# For non-f4m files, derive the RTMP playpath ("prefix:path"); for f4m,
# build the matching HLS (.m3u8) playlist URL instead.
2161 if not video_file.endswith('f4m'):
2162 ppath, prefix = video_file.split('.')
2163 video_playpath = '%s:%s' % (prefix, ppath)
2164 video_hls_playlist = ''
2167 video_hls_playlist = (
2168 video_filepath + video_file
2169 ).replace('.f4m', '.m3u8')
# The SWF player URL is needed by rtmpdump as player_url.
2171 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2172 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2174 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2180 'tc_url': video_url,
2182 'upload_date': None,
2183 'title': video_title,
2185 'play_path': video_playpath,
2186 'video_file': video_file,
2187 'video_hls_playlist': video_hls_playlist,
2188 'player_url': video_swfobj,
# NOTE(review): interior lines of this class are elided in this excerpt
# (format tables, several guards and loop bodies are missing).
2192 class ComedyCentralIE(InfoExtractor):
2193 """Information extractor for The Daily Show and Colbert Report """
2195 # urls can be abbreviations like :thedailyshow or :colbert
2196 # urls for episodes like:
2197 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2198 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2199 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex; note it must always be matched with re.VERBOSE (see suitable()).
2200 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2201 |(https?://)?(www\.)?
2202 (?P<showname>thedailyshow|colbertnation)\.com/
2203 (full-episodes/(?P<episode>.*)|
2205 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2206 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates in ascending order; the last entry is the highest quality.
2209 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2211 _video_extensions = {
2219 _video_dimensions = {
# Overrides the base class because _VALID_URL needs the re.VERBOSE flag.
2229 def suitable(cls, url):
2230 """Receives a URL and returns True if suitable for this IE."""
2231 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Print the known bitrate/extension/dimension table for --list-formats.
2233 def _print_formats(self, formats):
2234 print('Available formats:')
2236 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2239 def _real_extract(self, url):
2240 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2242 raise ExtractorError(u'Invalid URL: %s' % url)
# Shortnames like ":tds" expand to the show's full-episodes page.
2244 if mobj.group('shortname'):
2245 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2246 url = u'http://www.thedailyshow.com/full-episodes/'
2248 url = u'http://www.colbertnation.com/full-episodes/'
2249 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2250 assert mobj is not None
2252 if mobj.group('clip'):
2253 if mobj.group('showname') == 'thedailyshow':
2254 epTitle = mobj.group('tdstitle')
2256 epTitle = mobj.group('cntitle')
2259 dlNewest = not mobj.group('episode')
2261 epTitle = mobj.group('showname')
2263 epTitle = mobj.group('episode')
2265 self.report_extraction(epTitle)
2266 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# Follow server-side redirects and re-match so the episode group is filled in.
2268 url = htmlHandle.geturl()
2269 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2271 raise ExtractorError(u'Invalid redirected URL: ' + url)
2272 if mobj.group('episode') == '':
2273 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2274 epTitle = mobj.group('episode')
# Find the mtvnservices media URI embedded in the page.
2276 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2278 if len(mMovieParams) == 0:
2279 # The Colbert Report embeds the information in a without
2280 # a URL prefix; so extract the alternate reference
2281 # and then add the URL prefix manually.
2283 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2284 if len(altMovieParams) == 0:
2285 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2287 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Fetch the MRSS index listing every part of the episode.
2289 uri = mMovieParams[0][1]
2290 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2291 indexXml = self._download_webpage(indexUrl, epTitle,
2292 u'Downloading show index',
2293 u'unable to download episode index')
2297 idoc = xml.etree.ElementTree.fromstring(indexXml)
2298 itemEls = idoc.findall('.//item')
# One <item> per episode part; guid looks like "...:<show>.com:<shortMediaId>".
2299 for partNum,itemEl in enumerate(itemEls):
2300 mediaId = itemEl.findall('./guid')[0].text
2301 shortMediaId = mediaId.split(':')[-1]
2302 showId = mediaId.split(':')[-2].replace('.com', '')
2303 officialTitle = itemEl.findall('./title')[0].text
2304 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
# Per-part config XML lists the available renditions (bitrate -> rtmp url).
2306 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2307 compat_urllib_parse.urlencode({'uri': mediaId}))
2308 configXml = self._download_webpage(configUrl, epTitle,
2309 u'Downloading configuration for %s' % shortMediaId)
2311 cdoc = xml.etree.ElementTree.fromstring(configXml)
2313 for rendition in cdoc.findall('.//rendition'):
2314 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2318 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2321 if self._downloader.params.get('listformats', None):
2322 self._print_formats([i[0] for i in turls])
2325 # For now, just pick the highest bitrate
2326 format,rtmp_video_url = turls[-1]
2328 # Get the format arg from the arg stream
2329 req_format = self._downloader.params.get('format', None)
2331 # Select format if we can find one
2334 format, rtmp_video_url = f, v
# The rtmp URL's path is rewritten into a direct-HTTP mirror URL.
2337 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2339 raise ExtractorError(u'Cannot transform RTMP url')
2340 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2341 video_url = base + m.group('finalid')
2343 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2348 'upload_date': officialDate,
2353 'description': officialTitle,
2355 results.append(info)
# NOTE(review): interior lines of this class are elided in this excerpt
# (guards and the info-dict opener are missing).
2360 class EscapistIE(InfoExtractor):
2361 """Information extractor for The Escapist """
2363 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2364 IE_NAME = u'escapist'
2366 def _real_extract(self, url):
2367 mobj = re.match(self._VALID_URL, url)
2369 raise ExtractorError(u'Invalid URL: %s' % url)
2370 showName = mobj.group('showname')
2371 videoId = mobj.group('episode')
2373 self.report_extraction(videoId)
2374 webpage = self._download_webpage(url, videoId)
# Description and thumbnail come from meta tags; both are optional.
2376 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2377 webpage, u'description', fatal=False)
2379 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2380 webpage, u'thumbnail', fatal=False)
# og:video points at the flash player; its query string carries the config URL.
2382 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2383 webpage, u'player url')
# The title meta is "Show : Episode"; keep only the episode part.
2385 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2386 webpage, u'player url').split(' : ')[-1]
2388 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2389 configUrl = compat_urllib_parse.unquote(configUrl)
2391 configJSON = self._download_webpage(configUrl, videoId,
2392 u'Downloading configuration',
2393 u'unable to download configuration')
2395 # Technically, it's JavaScript, not JSON
# Crude single-to-double-quote fixup so json.loads can parse the JS object.
2396 configJSON = configJSON.replace("'", '"')
2399 config = json.loads(configJSON)
2400 except (ValueError,) as err:
2401 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2403 playlist = config['playlist']
# The media URL is the second playlist entry (the first is presumably an
# intro/ad — TODO confirm).
2404 videoUrl = playlist[1]['url']
2409 'uploader': showName,
2410 'upload_date': None,
2413 'thumbnail': imgUrl,
2414 'description': videoDesc,
2415 'player_url': playerUrl,
# NOTE(review): interior lines of this class are elided in this excerpt
# (try openers, guards and the return statement are missing).
2420 class CollegeHumorIE(InfoExtractor):
2421 """Information extractor for collegehumor.com"""
2424 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2425 IE_NAME = u'collegehumor'
2427 def report_manifest(self, video_id):
2428 """Report information extraction."""
2429 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2431 def _real_extract(self, url):
2432 mobj = re.match(self._VALID_URL, url)
2434 raise ExtractorError(u'Invalid URL: %s' % url)
2435 video_id = mobj.group('videoid')
2440 'upload_date': None,
2443 self.report_extraction(video_id)
# Step 1: fetch the moogaloop metadata XML for title/description/manifest URL.
2444 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2446 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2447 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2448 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2450 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2452 videoNode = mdoc.findall('./video')[0]
2453 info['description'] = videoNode.findall('./description')[0].text
2454 info['title'] = videoNode.findall('./caption')[0].text
2455 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2456 manifest_url = videoNode.findall('./file')[0].text
2458 raise ExtractorError(u'Invalid metadata XML file')
# hdcore parameter is required for the Adobe HDS manifest to be served.
2460 manifest_url += '?hdcore=2.10.3'
2461 self.report_manifest(video_id)
# Step 2: fetch the f4m manifest and pull the media node out of it.
2463 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2464 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2465 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2467 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# Elements live in the Adobe f4m XML namespace.
2469 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2470 node_id = media_node.attrib['url']
2471 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2472 except IndexError as err:
2473 raise ExtractorError(u'Invalid manifest file')
# Step 3: assemble the direct fragment URL from the manifest host + ids.
2475 url_pr = compat_urllib_parse_urlparse(manifest_url)
2476 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): interior lines of this class are elided in this excerpt
# (the "if mobj is None:" guard and the info-dict opener are missing).
2483 class XVideosIE(InfoExtractor):
2484 """Information extractor for xvideos.com"""
2486 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2487 IE_NAME = u'xvideos'
2489 def _real_extract(self, url):
2490 mobj = re.match(self._VALID_URL, url)
2492 raise ExtractorError(u'Invalid URL: %s' % url)
2493 video_id = mobj.group(1)
2495 webpage = self._download_webpage(url, video_id)
2497 self.report_extraction(video_id)
# The direct FLV URL is URL-encoded inside the page's flashvars.
2500 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2501 webpage, u'video URL'))
# Title is the <title> tag minus the trailing site suffix.
2504 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2507 # Extract video thumbnail
2508 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2509 webpage, u'thumbnail', fatal=False)
2515 'upload_date': None,
2516 'title': video_title,
2518 'thumbnail': video_thumbnail,
2519 'description': None,
# NOTE(review): interior lines of this class are elided in this excerpt
# (the "if mobj is None:" guard and the info-dict opener are missing).
2525 class SoundcloudIE(InfoExtractor):
2526 """Information extractor for soundcloud.com
2527 To access the media, the uid of the song and a stream token
2528 must be extracted from the page source and the script must make
2529 a request to media.soundcloud.com/crossdomain.xml. Then
2530 the media can be grabbed by requesting from an url composed
2531 of the stream token and uid
2534 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2535 IE_NAME = u'soundcloud'
2537 def report_resolve(self, video_id):
2538 """Report information extraction."""
2539 self.to_screen(u'%s: Resolving id' % video_id)
2541 def _real_extract(self, url):
2542 mobj = re.match(self._VALID_URL, url)
2544 raise ExtractorError(u'Invalid URL: %s' % url)
2546 # extract uploader (which is in the url)
2547 uploader = mobj.group(1)
2548 # extract simple title (uploader + slug of song title)
2549 slug_title = mobj.group(2)
2550 simple_title = uploader + u'-' + slug_title
2551 full_title = '%s/%s' % (uploader, slug_title)
2553 self.report_resolve(full_title)
# Step 1: the resolve API maps the human URL to the numeric track id.
2555 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2556 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2557 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2559 info = json.loads(info_json)
2560 video_id = info['id']
2561 self.report_extraction(full_title)
# Step 2: the streams endpoint returns the actual media URLs for the track.
2563 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2564 stream_json = self._download_webpage(streams_url, full_title,
2565 u'Downloading stream definitions',
2566 u'unable to download stream definitions')
2568 streams = json.loads(stream_json)
# Only the 128kbps MP3 stream is used.
2569 mediaURL = streams['http_mp3_128_url']
2570 upload_date = unified_strdate(info['created_at'])
2575 'uploader': info['user']['username'],
2576 'upload_date': upload_date,
2577 'title': info['title'],
2579 'description': info['description'],
# NOTE(review): interior lines of this class are elided in this excerpt
# (guards, the per-track info-dict opener and the return are missing).
2582 class SoundcloudSetIE(InfoExtractor):
2583 """Information extractor for soundcloud.com sets
2584 To access the media, the uid of the song and a stream token
2585 must be extracted from the page source and the script must make
2586 a request to media.soundcloud.com/crossdomain.xml. Then
2587 the media can be grabbed by requesting from an url composed
2588 of the stream token and uid
2591 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2592 IE_NAME = u'soundcloud:set'
2594 def report_resolve(self, video_id):
2595 """Report information extraction."""
2596 self.to_screen(u'%s: Resolving id' % video_id)
2598 def _real_extract(self, url):
2599 mobj = re.match(self._VALID_URL, url)
2601 raise ExtractorError(u'Invalid URL: %s' % url)
2603 # extract uploader (which is in the url)
2604 uploader = mobj.group(1)
2605 # extract simple title (uploader + slug of song title)
2606 slug_title = mobj.group(2)
2607 simple_title = uploader + u'-' + slug_title
2608 full_title = '%s/sets/%s' % (uploader, slug_title)
2610 self.report_resolve(full_title)
# Resolve the set URL to its JSON metadata (list of tracks).
2612 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2613 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2614 info_json = self._download_webpage(resolv_url, full_title)
2617 info = json.loads(info_json)
# The resolve API reports failures via an 'errors' list in the payload.
2618 if 'errors' in info:
2619 for err in info['errors']:
2620 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2623 self.report_extraction(full_title)
# One info dict per track in the set, mirroring SoundcloudIE's logic.
2624 for track in info['tracks']:
2625 video_id = track['id']
2627 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2628 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2630 self.report_extraction(video_id)
2631 streams = json.loads(stream_json)
2632 mediaURL = streams['http_mp3_128_url']
2637 'uploader': track['user']['username'],
2638 'upload_date': unified_strdate(track['created_at']),
2639 'title': track['title'],
2641 'description': track['description'],
# NOTE(review): interior lines of this class are elided in this excerpt
# (guards and the info-dict opener are missing).
2646 class InfoQIE(InfoExtractor):
2647 """Information extractor for infoq.com"""
2648 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2650 def _real_extract(self, url):
2651 mobj = re.match(self._VALID_URL, url)
2653 raise ExtractorError(u'Invalid URL: %s' % url)
# No numeric id in the URL, so the URL itself is used as the video id.
2655 webpage = self._download_webpage(url, video_id=url)
2656 self.report_extraction(url)
# The page embeds a base64-encoded, URL-quoted media path in "jsclassref".
2659 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2661 raise ExtractorError(u'Unable to extract video url')
2662 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2663 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2666 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2669 # Extract description
2670 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2671 webpage, u'description', fatal=False)
# Derive id/extension from the media filename at the end of the RTMP path.
2673 video_filename = video_url.split('/')[-1]
2674 video_id, extension = video_filename.split('.')
2680 'upload_date': None,
2681 'title': video_title,
2682 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2684 'description': video_description,
# NOTE(review): interior lines of this class are elided in this excerpt
# (try openers, guards, loop headers and some returns are missing). Also note
# several .decode('utf-8') calls here look Python-2-only — TODO confirm;
# the class is already marked _WORKING = False.
2689 class MixcloudIE(InfoExtractor):
2690 """Information extractor for www.mixcloud.com"""
2692 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2693 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2694 IE_NAME = u'mixcloud'
2696 def report_download_json(self, file_id):
2697 """Report JSON download."""
2698 self.to_screen(u'Downloading json')
2700 def get_urls(self, jsonData, fmt, bitrate='best'):
2701 """Get urls from 'audio_formats' section in json"""
# Formats may map bitrate -> url list, or be a flat url list (no bitrates).
2704 bitrate_list = jsonData[fmt]
2705 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2706 bitrate = max(bitrate_list) # select highest
2708 url_list = jsonData[fmt][bitrate]
2709 except TypeError: # we have no bitrate info.
2710 url_list = jsonData[fmt]
2713 def check_urls(self, url_list):
2714 """Returns 1st active url from list"""
# Probe each candidate with a request; the first that opens wins.
2715 for url in url_list:
2717 compat_urllib_request.urlopen(url)
2719 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Print the format table for --list-formats; handles both shapes of the
# formats mapping (with and without bitrate sub-keys).
2724 def _print_formats(self, formats):
2725 print('Available formats:')
2726 for fmt in formats.keys():
2727 for b in formats[fmt]:
2729 ext = formats[fmt][b][0]
2730 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2731 except TypeError: # we have no bitrate info
2732 ext = formats[fmt][0]
2733 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2736 def _real_extract(self, url):
2737 mobj = re.match(self._VALID_URL, url)
2739 raise ExtractorError(u'Invalid URL: %s' % url)
2740 # extract uploader & filename from url
2741 uploader = mobj.group(1).decode('utf-8')
2742 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2744 # construct API request
# API path reuses the last two URL path segments (uploader/track).
2745 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2746 # retrieve .json file with links to files
2747 request = compat_urllib_request.Request(file_url)
2749 self.report_download_json(file_url)
2750 jsonData = compat_urllib_request.urlopen(request).read()
2751 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2752 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2755 json_data = json.loads(jsonData)
2756 player_url = json_data['player_swf_url']
2757 formats = dict(json_data['audio_formats'])
2759 req_format = self._downloader.params.get('format', None)
2762 if self._downloader.params.get('listformats', None):
2763 self._print_formats(formats)
# Default/best: iterate formats until one yields a live URL.
2766 if req_format is None or req_format == 'best':
2767 for format_param in formats.keys():
2768 url_list = self.get_urls(formats, format_param)
2770 file_url = self.check_urls(url_list)
2771 if file_url is not None:
# Explicit format request: fail clearly if the format isn't offered.
2774 if req_format not in formats:
2775 raise ExtractorError(u'Format is not available')
2777 url_list = self.get_urls(formats, req_format)
2778 file_url = self.check_urls(url_list)
2779 format_param = req_format
2782 'id': file_id.decode('utf-8'),
2783 'url': file_url.decode('utf-8'),
2784 'uploader': uploader.decode('utf-8'),
2785 'upload_date': None,
2786 'title': json_data['name'],
2787 'ext': file_url.split('.')[-1].decode('utf-8'),
2788 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2789 'thumbnail': json_data['thumbnail_url'],
2790 'description': json_data['description'],
2791 'player_url': player_url.decode('utf-8'),
# NOTE(review): interior lines of this class are elided in this excerpt
# (info-dict openers, try openers and several returns are missing).
2794 class StanfordOpenClassroomIE(InfoExtractor):
2795 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: root page, a course page, or a specific video page.
2797 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2798 IE_NAME = u'stanfordoc'
2800 def _real_extract(self, url):
2801 mobj = re.match(self._VALID_URL, url)
2803 raise ExtractorError(u'Invalid URL: %s' % url)
# Case 1: course + video present -> extract that single video.
2805 if mobj.group('course') and mobj.group('video'): # A specific video
2806 course = mobj.group('course')
2807 video = mobj.group('video')
2809 'id': course + '_' + video,
2811 'upload_date': None,
2814 self.report_extraction(info['id'])
# Per-video XML sidecar gives title and media filename.
2815 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2816 xmlUrl = baseUrl + video + '.xml'
2818 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2819 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2820 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2821 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2823 info['title'] = mdoc.findall('./title')[0].text
2824 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2826 raise ExtractorError(u'Invalid metadata XML file')
2827 info['ext'] = info['url'].rpartition('.')[2]
# Case 2: only a course -> collect every VideoPage link and recurse.
2829 elif mobj.group('course'): # A course page
2830 course = mobj.group('course')
2835 'upload_date': None,
2838 coursepage = self._download_webpage(url, info['id'],
2839 note='Downloading course info page',
2840 errnote='Unable to download course info page')
2842 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2844 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2845 coursepage, u'description', fatal=False)
# orderedSet dedupes while preserving discovery order.
2847 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2850 'type': 'reference',
2851 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Each reference entry is re-dispatched through self.extract().
2855 for entry in info['list']:
2856 assert entry['type'] == 'reference'
2857 results += self.extract(entry['url'])
# Case 3: root page -> collect every CoursePage link and recurse.
2861 'id': 'Stanford OpenClassroom',
2864 'upload_date': None,
2867 self.report_download_webpage(info['id'])
2868 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2870 rootpage = compat_urllib_request.urlopen(rootURL).read()
2871 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2872 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2874 info['title'] = info['id']
2876 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2879 'type': 'reference',
2880 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2885 for entry in info['list']:
2886 assert entry['type'] == 'reference'
2887 results += self.extract(entry['url'])
# NOTE(review): interior lines of this class are elided in this excerpt
# (guards, a performer extraction and the info-dict opener are missing).
2890 class MTVIE(InfoExtractor):
2891 """Information extractor for MTV.com"""
2893 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2896 def _real_extract(self, url):
2897 mobj = re.match(self._VALID_URL, url)
2899 raise ExtractorError(u'Invalid URL: %s' % url)
# Scheme-less URLs are accepted by _VALID_URL; normalize to http://.
2900 if not mobj.group('proto'):
2901 url = 'http://' + url
2902 video_id = mobj.group('videoid')
2904 webpage = self._download_webpage(url, video_id)
# Metadata comes from mtv_* meta tags in the page; most are optional.
2906 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2907 webpage, u'song name', fatal=False)
2909 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2912 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2913 webpage, u'mtvn_uri', fatal=False)
2915 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2916 webpage, u'content id', fatal=False)
# mediaGen endpoint returns XML listing the available renditions.
2918 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2919 self.report_extraction(video_id)
2920 request = compat_urllib_request.Request(videogen_url)
2922 metadataXml = compat_urllib_request.urlopen(request).read()
2923 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2924 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2926 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2927 renditions = mdoc.findall('.//rendition')
2929 # For now, always pick the highest quality.
2930 rendition = renditions[-1]
# Format string combines container, dimensions and bitrate, e.g. "mp4-640x360_800".
2933 _,_,ext = rendition.attrib['type'].partition('/')
2934 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2935 video_url = rendition.find('./src').text
2937 raise ExtractorError('Invalid rendition field.')
2942 'uploader': performer,
2943 'upload_date': None,
2944 'title': video_title,
# NOTE(review): interior lines of this class are elided in this excerpt
# (method headers such as _gen_sid, guards, format-selection branches and
# the final return are missing).
2952 class YoukuIE(InfoExtractor):
2953 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp concatenated with two random numbers.
2956 nowTime = int(time.time() * 1000)
2957 random1 = random.randint(1000,1998)
2958 random2 = random.randint(1000,9999)
2960 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic shuffle of a fixed alphabet driven by the server 'seed';
# used as a lookup table to decode obfuscated file ids.
2962 def _get_file_ID_mix_string(self, seed):
2964 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2966 for i in range(len(source)):
2967 seed = (seed * 211 + 30031 ) % 65536
2968 index = math.floor(seed / 65536 * len(source) )
2969 mixed.append(source[int(index)])
2970 source.remove(source[int(index)])
2971 #return ''.join(mixed)
# Decode a '*'-separated list of indices into the real file id via the
# seed-shuffled alphabet above.
2974 def _get_file_id(self, fileId, seed):
2975 mixed = self._get_file_ID_mix_string(seed)
2976 ids = fileId.split('*')
2980 realId.append(mixed[int(ch)])
2981 return ''.join(realId)
2983 def _real_extract(self, url):
2984 mobj = re.match(self._VALID_URL, url)
2986 raise ExtractorError(u'Invalid URL: %s' % url)
2987 video_id = mobj.group('ID')
2989 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2991 jsondata = self._download_webpage(info_url, video_id)
2993 self.report_extraction(video_id)
2995 config = json.loads(jsondata)
2997 video_title = config['data'][0]['title']
2998 seed = config['data'][0]['seed']
# Pick a stream format: honor --format, else best (hd2 preferred) / worst.
3000 format = self._downloader.params.get('format', None)
3001 supported_format = list(config['data'][0]['streamfileids'].keys())
3003 if format is None or format == 'best':
3004 if 'hd2' in supported_format:
3009 elif format == 'worst':
# Segment keys ('k') authorize each download URL.
3017 fileid = config['data'][0]['streamfileids'][format]
3018 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3019 except (UnicodeDecodeError, ValueError, KeyError):
3020 raise ExtractorError(u'Unable to extract info section')
3023 sid = self._gen_sid()
3024 fileid = self._get_file_id(fileid, seed)
3026 #column 8,9 of fileid represent the segment number
3027 #fileid[7:9] should be changed
# Build one download URL (and info dict) per segment of the video.
3028 for index, key in enumerate(keys):
3030 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3031 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3034 'id': '%s_part%02d' % (video_id, index),
3035 'url': download_url,
3037 'upload_date': None,
3038 'title': video_title,
3041 files_info.append(info)
# Extractor for xnxx.com: flv URL, title and thumbnail are scraped
# straight from the watch page with the class-level regexes below.
# NOTE(review): elided numbered listing -- some source lines are missing.
3046 class XNXXIE(InfoExtractor):
3047 """Information extractor for xnxx.com"""
3049 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3051 VIDEO_URL_RE = r'flv_url=(.*?)&'
3052 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3053 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3055 def _real_extract(self, url):
3056 mobj = re.match(self._VALID_URL, url)
3058 raise ExtractorError(u'Invalid URL: %s' % url)
3059 video_id = mobj.group(1)
3061 # Get webpage content
3062 webpage = self._download_webpage(url, video_id)
# flv_url is percent-encoded inside the page, hence the unquote below.
3064 video_url = self._search_regex(self.VIDEO_URL_RE,
3065 webpage, u'video URL')
3066 video_url = compat_urllib_parse.unquote(video_url)
3068 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
# Thumbnail is optional (fatal=False): None if the regex does not match.
3071 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3072 webpage, u'thumbnail', fatal=False)
3078 'upload_date': None,
3079 'title': video_title,
3081 'thumbnail': video_thumbnail,
3082 'description': None,
# NOTE(review): elided numbered listing -- some source lines are missing.
3086 class GooglePlusIE(InfoExtractor):
3087 """Information extractor for plus.google.com."""
3089 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3090 IE_NAME = u'plus.google'
3092 def _real_extract(self, url):
3093 # Extract id from URL
3094 mobj = re.match(self._VALID_URL, url)
3096 raise ExtractorError(u'Invalid URL: %s' % url)
3098 post_url = mobj.group(0)
3099 video_id = mobj.group(1)
3101 video_extension = 'flv'
3103 # Step 1, Retrieve post webpage to extract further information
3104 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3106 self.report_extraction(video_id)
3108 # Extract update date
3109 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3110 webpage, u'upload date', fatal=False)
3112 # Convert timestring to a format suitable for filename
# NOTE(review): upload_date may be None here (fatal=False above); if a
# guard is not among the elided lines, strptime would raise -- verify.
3113 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3114 upload_date = upload_date.strftime('%Y%m%d')
3117 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3118 webpage, u'uploader', fatal=False)
3121 # Get the first line for title
3122 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3123 webpage, 'title', default=u'NA')
3125 # Step 2, Stimulate clicking the image box to launch video
3126 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3127 webpage, u'video page URL')
3128 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3130 # Extract video links on video page
3131 """Extract video links of all sizes"""
3132 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3133 mobj = re.findall(pattern, webpage)
3135 raise ExtractorError(u'Unable to extract video links')
3137 # Sort in resolution
3138 links = sorted(mobj)
3140 # Choose the lowest of the sort, i.e. highest resolution
3141 video_url = links[-1]
3142 # Only get the url. The resolution part in the tuple has no use anymore
3143 video_url = video_url[-1]
3144 # Treat escaped \u0026 style hex
# Python 2 str has .decode; on Python 3 the AttributeError branch
# round-trips through bytes to apply unicode-escape decoding.
3146 video_url = video_url.decode("unicode_escape")
3147 except AttributeError: # Python 3
3148 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3154 'uploader': uploader,
3155 'upload_date': upload_date,
3156 'title': video_title,
3157 'ext': video_extension,
# Extractor for nba.com video pages: the mp4 URL is constructed from the
# URL path against Turner's CDN; title/description come from meta tags.
# NOTE(review): elided numbered listing -- some source lines are missing.
3160 class NBAIE(InfoExtractor):
3161 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3164 def _real_extract(self, url):
3165 mobj = re.match(self._VALID_URL, url)
3167 raise ExtractorError(u'Invalid URL: %s' % url)
3169 video_id = mobj.group(1)
3171 webpage = self._download_webpage(url, video_id)
# Direct CDN URL; video_id starts with '/' (captured path), so this
# concatenation yields .../nba/big/<path>_nba_1280x720.mp4.
3173 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3175 shortened_video_id = video_id.rpartition('/')[2]
3176 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3177 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3179 # It isn't there in the HTML it returns to us
3180 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3182 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3185 'id': shortened_video_id,
3189 # 'uploader_date': uploader_date,
3190 'description': description,
# NOTE(review): elided numbered listing -- some source lines are missing.
3194 class JustinTVIE(InfoExtractor):
3195 """Information extractor for justin.tv and twitch.tv"""
3196 # TODO: One broadcast may be split into multiple videos. The key
3197 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3198 # starts at 1 and increases. Can we treat all parts as one video?
# Three URL shapes: a channel page, /b/<videoid> (archived broadcast),
# and /c/<chapterid> (a chapter of a broadcast).
3200 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3202 (?P<channelid>[^/]+)|
3203 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3204 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# Page size used when paginating the channel-archives API.
3208 _JUSTIN_PAGE_LIMIT = 100
3209 IE_NAME = u'justin.tv'
def report_download_page(self, channel, offset):
    """Announce that the page of videos at [offset, offset + page limit) is being fetched."""
    upper_bound = offset + self._JUSTIN_PAGE_LIMIT
    self.to_screen(u'%s: Downloading video information from %d to %d' %
                   (channel, offset, upper_bound))
# NOTE(review): elided numbered listing -- some source lines are missing.
3216 # Return count of items, list of *valid* items
3217 def _parse_page(self, url, video_id):
3218 webpage = self._download_webpage(url, video_id,
3219 u'Downloading video info JSON',
3220 u'unable to download video info JSON')
# A successful API response is a JSON list; anything else is an error
# object carrying an 'error' message.
3222 response = json.loads(webpage)
3223 if type(response) != list:
3224 error_text = response.get('error', 'unknown error')
3225 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3227 for clip in response:
3228 video_url = clip['video_file_url']
3230 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; keep the date part and strip the dashes to
# get the YYYYMMDD upload_date convention.
3231 video_date = re.sub('-', '', clip['start_time'][:10])
3232 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3233 video_id = clip['id']
3234 video_title = clip.get('title', video_id)
3238 'title': video_title,
3239 'uploader': clip.get('channel_name', video_uploader_id),
3240 'uploader_id': video_uploader_id,
3241 'upload_date': video_date,
3242 'ext': video_extension,
3244 return (len(response), info)
# Dispatch on URL shape: channel -> paginated archives API; chapter ->
# scrape archive id, fetch chapter XML plus kraken metadata; video ->
# by_archive API.
# NOTE(review): elided numbered listing -- some source lines are missing.
3246 def _real_extract(self, url):
3247 mobj = re.match(self._VALID_URL, url)
3249 raise ExtractorError(u'invalid URL: %s' % url)
3251 api_base = 'http://api.justin.tv'
3253 if mobj.group('channelid'):
3255 video_id = mobj.group('channelid')
3256 api = api_base + '/channel/archives/%s.json' % video_id
3257 elif mobj.group('chapterid'):
3258 chapter_id = mobj.group('chapterid')
# The chapter page embeds the owning archive id as a JS assignment.
3260 webpage = self._download_webpage(url, chapter_id)
3261 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3263 raise ExtractorError(u'Cannot find archive of a chapter')
3264 archive_id = m.group(1)
3266 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3267 chapter_info_xml = self._download_webpage(api, chapter_id,
3268 note=u'Downloading chapter information',
3269 errnote=u'Chapter information download failed')
3270 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Locate the <archive> entry matching the archive id scraped above.
3271 for a in doc.findall('.//archive'):
3272 if archive_id == a.find('./id').text:
3275 raise ExtractorError(u'Could not find chapter in chapter information')
3277 video_url = a.find('./video_file_url').text
3278 video_ext = video_url.rpartition('.')[2] or u'flv'
# Twitch's kraken API supplies title/thumbnail/uploader metadata.
3280 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3281 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3282 note='Downloading chapter metadata',
3283 errnote='Download of chapter metadata failed')
3284 chapter_info = json.loads(chapter_info_json)
3286 bracket_start = int(doc.find('.//bracket_start').text)
3287 bracket_end = int(doc.find('.//bracket_end').text)
3289 # TODO determine start (and probably fix up file)
3290 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3291 #video_url += u'?start=' + TODO:start_timestamp
3292 # bracket_start is 13290, but we want 51670615
3293 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3294 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3297 'id': u'c' + chapter_id,
3300 'title': chapter_info['title'],
3301 'thumbnail': chapter_info['preview'],
3302 'description': chapter_info['description'],
3303 'uploader': chapter_info['channel']['display_name'],
3304 'uploader_id': chapter_info['channel']['name'],
3308 video_id = mobj.group('videoid')
3309 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3311 self.report_extraction(video_id)
# Paginate through the API; a short page (count != limit) means done.
3315 limit = self._JUSTIN_PAGE_LIMIT
3318 self.report_download_page(video_id, offset)
3319 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3320 page_count, page_info = self._parse_page(page_url, video_id)
3321 info.extend(page_info)
3322 if not paged or page_count != limit:
# Extractor for funnyordie.com: video URL, title and description are
# scraped from the watch page markup.
# NOTE(review): elided numbered listing -- some source lines are missing.
3327 class FunnyOrDieIE(InfoExtractor):
3328 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3330 def _real_extract(self, url):
3331 mobj = re.match(self._VALID_URL, url)
3333 raise ExtractorError(u'invalid URL: %s' % url)
3335 video_id = mobj.group('id')
3336 webpage = self._download_webpage(url, video_id)
# The second <source> inside the <video> tag carries the file URL.
3338 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3339 webpage, u'video URL', flags=re.DOTALL)
# Title: prefer the player header, fall back to the <title> tag.
3341 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3342 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3344 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3345 webpage, u'description', fatal=False, flags=re.DOTALL)
3352 'description': video_description,
# NOTE(review): elided numbered listing -- the verbose-regex body is
# partially missing (e.g. the 'gameID' group referenced by
# _real_extract is not visible here).
3356 class SteamIE(InfoExtractor):
3357 _VALID_URL = r"""http://store\.steampowered\.com/
3359 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3361 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3363 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
# Agecheck URL with a fixed birth date to bypass the age gate.
3364 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
def suitable(cls, url):
    """Return True if this extractor can handle `url` (verbose-mode _VALID_URL)."""
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    return match is not None
# Fetch a game's video page (passing the age gate if shown), scrape all
# movie entries and return them as a playlist.
# NOTE(review): elided numbered listing -- some source lines are missing.
3371 def _real_extract(self, url):
3372 m = re.match(self._VALID_URL, url, re.VERBOSE)
3373 gameID = m.group('gameID')
3375 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3376 webpage = self._download_webpage(videourl, gameID)
# Age gate: re-request via the agecheck URL with a canned birth date.
3378 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3379 videourl = self._AGECHECK_TEMPLATE % gameID
3380 self.report_age_confirmation()
3381 webpage = self._download_webpage(videourl, gameID)
3383 self.report_extraction(gameID)
3384 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3385 webpage, 'game title')
# Three parallel scans over the page: movie JS blobs, display titles,
# thumbnails -- zipped together positionally below.
3387 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3388 mweb = re.finditer(urlRE, webpage)
3389 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3390 titles = re.finditer(namesRE, webpage)
3391 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3392 thumbs = re.finditer(thumbsRE, webpage)
3394 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3395 video_id = vid.group('videoID')
3396 title = vtitle.group('videoName')
3397 video_url = vid.group('videoURL')
3398 video_thumb = thumb.group('thumbnail')
3400 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3405 'title': unescapeHTML(title),
3406 'thumbnail': video_thumb
3409 return [self.playlist_result(videos, gameID, game_title)]
# Extractor for recorded ustream.tv videos: the flv URL is derived from
# the numeric id; title/uploader/thumbnail are scraped from the page.
# NOTE(review): elided numbered listing -- some source lines are missing.
3411 class UstreamIE(InfoExtractor):
3412 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3413 IE_NAME = u'ustream'
3415 def _real_extract(self, url):
3416 m = re.match(self._VALID_URL, url)
3417 video_id = m.group('videoID')
# Direct CDN URL built from the recording id.
3419 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3420 webpage = self._download_webpage(url, video_id)
3422 self.report_extraction(video_id)
3424 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3427 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3428 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3430 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3431 webpage, u'thumbnail', fatal=False)
3437 'title': video_title,
3438 'uploader': uploader,
3439 'thumbnail': thumbnail,
# NOTE(review): elided numbered listing -- some source lines are missing.
3443 class WorldStarHipHopIE(InfoExtractor):
3444 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3445 IE_NAME = u'WorldStarHipHop'
3447 def _real_extract(self, url):
3448 m = re.match(self._VALID_URL, url)
3449 video_id = m.group('id')
3451 webpage_src = self._download_webpage(url, video_id)
# The player is configured via so.addVariable("file", ...) in JS.
3453 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3454 webpage_src, u'video URL')
3456 if 'mp4' in video_url:
3461 video_title = self._html_search_regex(r"<title>(.*)</title>",
3462 webpage_src, u'title')
3464 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3465 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3466 webpage_src, u'thumbnail', fatal=False)
# Candy pages keep the real title in a candytitles span instead.
3469 _title = r"""candytitles.*>(.*)</span>"""
3470 mobj = re.search(_title, webpage_src)
3471 if mobj is not None:
3472 video_title = mobj.group(1)
3477 'title' : video_title,
3478 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows: all metadata lives in a JSON blob
# assigned to window.gon in the page's JS.
# NOTE(review): elided numbered listing -- some source lines are missing.
3483 class RBMARadioIE(InfoExtractor):
3484 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3486 def _real_extract(self, url):
3487 m = re.match(self._VALID_URL, url)
3488 video_id = m.group('videoID')
3490 webpage = self._download_webpage(url, video_id)
3492 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
3493 webpage, u'json data', flags=re.MULTILINE)
3496 data = json.loads(json_data)
3497 except ValueError as e:
3498 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Append a fixed 256kbps constant-bitrate hint to the Akamai URL.
3500 video_url = data['akamai_url'] + '&cbr=256'
3501 url_parts = compat_urllib_parse_urlparse(video_url)
3502 video_ext = url_parts.path.rpartition('.')[2]
3507 'title': data['title'],
3508 'description': data.get('teaser_text'),
3509 'location': data.get('country_of_origin'),
3510 'uploader': data.get('host', {}).get('name'),
3511 'uploader_id': data.get('host', {}).get('slug'),
3512 'thumbnail': data.get('image', {}).get('large_url_2x'),
3513 'duration': data.get('duration'),
# NOTE(review): elided numbered listing -- some source lines are missing.
3518 class YouPornIE(InfoExtractor):
3519 """Information extractor for youporn.com."""
3520 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
def _print_formats(self, formats):
    """Print a table of every available format (ext + format id) to stdout."""
    header_lines = (u'Available formats:',
                    u'ext\t\tformat',
                    u'---------------------------------')
    for line in header_lines:
        print(line)
    for fmt in formats:
        print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))
# Select the entry in `formats` whose 'format' field equals req_format
# (the surrounding loop/return lines are elided in this listing).
3530 def _specific(self, req_format, formats):
3532 if(x["format"]==req_format):
# Fetch the watch page (age-verified via cookie), parse the embedded
# JSON player config, enumerate the download links and return the
# requested format(s).
# NOTE(review): elided numbered listing -- some source lines are missing.
3536 def _real_extract(self, url):
3537 mobj = re.match(self._VALID_URL, url)
3539 raise ExtractorError(u'Invalid URL: %s' % url)
3540 video_id = mobj.group('videoid')
# The age gate is bypassed with a pre-set cookie.
3542 req = compat_urllib_request.Request(url)
3543 req.add_header('Cookie', 'age_verified=1')
3544 webpage = self._download_webpage(req, video_id)
3546 # Get JSON parameters
3547 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3549 params = json.loads(json_params)
3551 raise ExtractorError(u'Invalid JSON')
3553 self.report_extraction(video_id)
3555 video_title = params['title']
3556 upload_date = unified_strdate(params['release_date_f'])
3557 video_description = params['description']
3558 video_uploader = params['submitted_by']
3559 thumbnail = params['thumbnails'][0]['image']
3561 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3563 # Get all of the formats available
3564 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3565 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3566 webpage, u'download list').strip()
3568 # Get all of the links from the page
3569 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3570 links = re.findall(LINK_RE, download_list_html)
3571 if(len(links) == 0):
3572 raise ExtractorError(u'ERROR: no known formats available for video')
3574 self.to_screen(u'Links found: %d' % len(links))
3579 # A link looks like this:
3580 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3581 # A path looks like this:
3582 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Derive ext and a "<resolution>-<bitrate>" format id from the CDN path.
3583 video_url = unescapeHTML( link )
3584 path = compat_urllib_parse_urlparse( video_url ).path
3585 extension = os.path.splitext( path )[1][1:]
3586 format = path.split('/')[4].split('_')[:2]
3589 format = "-".join( format )
3590 # title = u'%s-%s-%s' % (video_title, size, bitrate)
3595 'uploader': video_uploader,
3596 'upload_date': upload_date,
3597 'title': video_title,
3600 'thumbnail': thumbnail,
3601 'description': video_description
# Format selection: list, best (first), worst (last), all, or a
# specific format id via _specific().
3604 if self._downloader.params.get('listformats', None):
3605 self._print_formats(formats)
3608 req_format = self._downloader.params.get('format', None)
3609 self.to_screen(u'Format: %s' % req_format)
3611 if req_format is None or req_format == 'best':
3613 elif req_format == 'worst':
3614 return [formats[-1]]
3615 elif req_format in ('-1', 'all'):
3618 format = self._specific( req_format, formats )
3620 raise ExtractorError(u'Requested format not available')
# NOTE(review): elided numbered listing -- some source lines are missing.
3625 class PornotubeIE(InfoExtractor):
3626 """Information extractor for pornotube.com."""
3627 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3629 def _real_extract(self, url):
3630 mobj = re.match(self._VALID_URL, url)
3632 raise ExtractorError(u'Invalid URL: %s' % url)
3634 video_id = mobj.group('videoid')
# The title is taken from the URL itself, not from the page.
3635 video_title = mobj.group('title')
3637 # Get webpage content
3638 webpage = self._download_webpage(url, video_id)
3641 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3642 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3643 video_url = compat_urllib_parse.unquote(video_url)
3645 #Get the uploaded date
3646 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3647 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
3648 if upload_date: upload_date = unified_strdate(upload_date)
3650 info = {'id': video_id,
3653 'upload_date': upload_date,
3654 'title': video_title,
# NOTE(review): elided numbered listing -- some source lines are missing.
3660 class YouJizzIE(InfoExtractor):
3661 """Information extractor for youjizz.com."""
3662 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3664 def _real_extract(self, url):
3665 mobj = re.match(self._VALID_URL, url)
3667 raise ExtractorError(u'Invalid URL: %s' % url)
3669 video_id = mobj.group('videoid')
3671 # Get webpage content
3672 webpage = self._download_webpage(url, video_id)
3674 # Get the video title
3675 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3676 webpage, u'title').strip()
3678 # Get the embed page
3679 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3681 raise ExtractorError(u'ERROR: unable to extract embed page')
# The numeric id from the embed URL replaces the slug-based id.
3683 embed_page_url = result.group(0).strip()
3684 video_id = result.group('videoid')
3686 webpage = self._download_webpage(embed_page_url, video_id)
# The flv URL is passed to the flash player via addVariable("file",...).
3689 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3690 webpage, u'video URL')
3692 info = {'id': video_id,
3694 'title': video_title,
3697 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: walks the play/next API to collect
# every track of a mix as a separate entry.
# NOTE(review): elided numbered listing -- some source lines are missing;
# in particular `mix_id` used below is presumably bound on an elided
# line (likely from `data`) -- verify against the full source.
3701 class EightTracksIE(InfoExtractor):
3703 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3705 def _real_extract(self, url):
3706 mobj = re.match(self._VALID_URL, url)
3708 raise ExtractorError(u'Invalid URL: %s' % url)
3709 playlist_id = mobj.group('id')
3711 webpage = self._download_webpage(url, playlist_id)
3713 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3714 data = json.loads(json_like)
# A random session id is required by the play/next endpoints.
3716 session = str(random.randint(0, 1000000000))
3718 track_count = data['tracks_count']
3719 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3720 next_url = first_url
# Fetch tracks one at a time until the API flags the last track.
3722 for i in itertools.count():
3723 api_json = self._download_webpage(next_url, playlist_id,
3724 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3725 errnote=u'Failed to download song information')
3726 api_data = json.loads(api_json)
3727 track_data = api_data[u'set']['track']
3729 'id': track_data['id'],
3730 'url': track_data['track_file_stream_url'],
3731 'title': track_data['performer'] + u' - ' + track_data['name'],
3732 'raw_title': track_data['name'],
3733 'uploader_id': data['user']['login'],
3737 if api_data['set']['at_last_track']:
3739 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: video and thumbnail URLs are derived directly
# from the id against the CDN; title/uploader scraped from the page.
# NOTE(review): elided numbered listing -- some source lines are missing.
3742 class KeekIE(InfoExtractor):
3743 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3746 def _real_extract(self, url):
3747 m = re.match(self._VALID_URL, url)
3748 video_id = m.group('videoID')
3750 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3751 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3752 webpage = self._download_webpage(url, video_id)
3754 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3757 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3758 webpage, u'uploader', fatal=False)
3764 'title': video_title,
3765 'thumbnail': thumbnail,
3766 'uploader': uploader
# Extractor for ted.com talks and playlists; the verbose regex below
# distinguishes the two URL shapes.
# NOTE(review): elided numbered listing -- some source lines are missing.
3770 class TEDIE(InfoExtractor):
3771 _VALID_URL=r'''http://www\.ted\.com/
3773 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3775 ((?P<type_talk>talks)) # We have a simple talk
3777 (/lang/(.*?))? # The url may contain the language
3778 /(?P<name>\w+) # Here goes the name and then ".html"
def suitable(cls, url):
    """Return True if this extractor can handle `url` (verbose-mode _VALID_URL)."""
    return bool(re.match(cls._VALID_URL, url, re.VERBOSE))
# Dispatch: a single talk URL yields one talk, a playlist URL yields a
# playlist result.
# NOTE(review): elided numbered listing -- some source lines are missing.
3786 def _real_extract(self, url):
3787 m=re.match(self._VALID_URL, url, re.VERBOSE)
3788 if m.group('type_talk'):
3789 return [self._talk_info(url)]
3791 playlist_id=m.group('playlist_id')
3792 name=m.group('name')
3793 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3794 return [self._playlist_videos_info(url,name,playlist_id)]
def _talk_video_link(self, mediaSlug):
    """Return the direct mp4 download URL for the given media slug."""
    url_template = 'http://download.ted.com/talks/%s.mp4'
    return url_template % mediaSlug
# NOTE(review): elided numbered listing -- some source lines are missing.
3800 def _playlist_videos_info(self,url,name,playlist_id=0):
3801 '''Returns the videos of the playlist'''
# Two parallel scans: talk <li> entries (ids/slugs) and talk-title
# anchors (URLs/names), zipped positionally below.
3803 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3804 ([.\s]*?)data-playlist_item_id="(\d+)"
3805 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3807 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3808 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3809 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3810 m_names=re.finditer(video_name_RE,webpage)
3812 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3813 m_playlist = re.search(playlist_RE, webpage)
3814 playlist_title = m_playlist.group('playlist_title')
# Each talk is delegated back to this extractor via url_result.
3816 playlist_entries = []
3817 for m_video, m_name in zip(m_videos,m_names):
3818 video_id=m_video.group('video_id')
3819 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3820 playlist_entries.append(self.url_result(talk_url, 'TED'))
3821 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
# NOTE(review): elided numbered listing -- some source lines are missing.
3823 def _talk_info(self, url, video_id=0):
3824 """Return the video for the talk in the url"""
3825 m=re.match(self._VALID_URL, url,re.VERBOSE)
3826 videoName=m.group('name')
3827 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3828 # If the url includes the language we get the title translated
3829 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3830 title=re.search(title_RE, webpage).group('title')
# talkDetails JS blob carries the numeric id and the media slug used to
# build the download URL.
3831 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3832 "id":(?P<videoID>[\d]+).*?
3833 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3834 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3835 thumb_match=re.search(thumb_RE,webpage)
3836 info_match=re.search(info_RE,webpage,re.VERBOSE)
3837 video_id=info_match.group('videoID')
3838 mediaSlug=info_match.group('mediaSlug')
3839 video_url=self._talk_video_link(mediaSlug)
3845 'thumbnail': thumb_match.group('thumbnail')
# Extractor for myspass.de: the video id is taken from the URL path and
# all metadata comes from a per-video XML endpoint.
# NOTE(review): elided numbered listing -- some source lines are missing.
3849 class MySpassIE(InfoExtractor):
3850 _VALID_URL = r'http://www.myspass.de/.*'
3852 def _real_extract(self, url):
3853 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3855 # video id is the last path element of the URL
3856 # usually there is a trailing slash, so also try the second but last
3857 url_path = compat_urllib_parse_urlparse(url).path
3858 url_parent_path, video_id = os.path.split(url_path)
3860 _, video_id = os.path.split(url_parent_path)
3863 metadata_url = META_DATA_URL_TEMPLATE % video_id
3864 metadata_text = self._download_webpage(metadata_url, video_id)
3865 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3867 # extract values from metadata
# url_flv and title are mandatory; format/description/thumbnail are
# optional elements of the metadata XML.
3868 url_flv_el = metadata.find('url_flv')
3869 if url_flv_el is None:
3870 raise ExtractorError(u'Unable to extract download url')
3871 video_url = url_flv_el.text
3872 extension = os.path.splitext(video_url)[1][1:]
3873 title_el = metadata.find('title')
3874 if title_el is None:
3875 raise ExtractorError(u'Unable to extract title')
3876 title = title_el.text
3877 format_id_el = metadata.find('format_id')
3878 if format_id_el is None:
3881 format = format_id_el.text
3882 description_el = metadata.find('description')
3883 if description_el is not None:
3884 description = description_el.text
3887 imagePreview_el = metadata.find('imagePreview')
3888 if imagePreview_el is not None:
3889 thumbnail = imagePreview_el.text
3898 'thumbnail': thumbnail,
3899 'description': description
# Extractor for spiegel.de videos: stream metadata comes from a per-id
# XML file on the flash server.
# NOTE(review): elided numbered listing -- some source lines are missing.
3903 class SpiegelIE(InfoExtractor):
3904 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3906 def _real_extract(self, url):
3907 m = re.match(self._VALID_URL, url)
3908 video_id = m.group('videoID')
3910 webpage = self._download_webpage(url, video_id)
3912 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3915 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3916 xml_code = self._download_webpage(xml_url, video_id,
3917 note=u'Downloading XML', errnote=u'Failed to download XML')
# The last child element of the XML document is the variant used here.
3919 idoc = xml.etree.ElementTree.fromstring(xml_code)
3920 last_type = idoc[-1]
3921 filename = last_type.findall('./filename')[0].text
3922 duration = float(last_type.findall('./duration')[0].text)
3924 video_url = 'http://video2.spiegel.de/flash/' + filename
3925 video_ext = filename.rpartition('.')[2]
3930 'title': video_title,
3931 'duration': duration,
# NOTE(review): elided numbered listing -- some source lines are missing.
3935 class LiveLeakIE(InfoExtractor):
3937 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3938 IE_NAME = u'liveleak'
3940 def _real_extract(self, url):
3941 mobj = re.match(self._VALID_URL, url)
3943 raise ExtractorError(u'Invalid URL: %s' % url)
3945 video_id = mobj.group('video_id')
3947 webpage = self._download_webpage(url, video_id)
# The player config embeds the file URL as `file: "..."`.
3949 video_url = self._search_regex(r'file: "(.*?)",',
3950 webpage, u'video URL')
# og:title carries a "LiveLeak.com -" prefix that is stripped here.
3952 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3953 webpage, u'title').replace('LiveLeak.com -', '').strip()
3955 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3956 webpage, u'description', fatal=False)
3958 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3959 webpage, u'uploader', fatal=False)
3965 'title': video_title,
3966 'description': video_description,
3967 'uploader': video_uploader
# Extractor for ARD Mediathek / daserste.de: picks the highest-quality
# default media stream, which may be an RTMP stream or a plain HTTP mp4.
# NOTE(review): elided numbered listing -- some source lines are missing.
3972 class ARDIE(InfoExtractor):
3973 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3974 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3975 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3977 def _real_extract(self, url):
3978 # determine video id from url
# Prefer the numeric documentId query parameter when present.
3979 m = re.match(self._VALID_URL, url)
3981 numid = re.search(r'documentId=([0-9]+)', url)
3983 video_id = numid.group(1)
3985 video_id = m.group('video_id')
3987 # determine title and media streams from webpage
3988 html = self._download_webpage(url, video_id)
3989 title = re.search(self._TITLE, html).group('title')
3990 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams at all implies the FSK age restriction (evening-only).
3992 assert '"fsk"' in html
3993 raise ExtractorError(u'This video is only available after 8:00 pm')
3995 # choose default media type and highest quality for now
3996 stream = max([s for s in streams if int(s["media_type"]) == 0],
3997 key=lambda s: int(s["quality"]))
3999 # there's two possibilities: RTMP stream or HTTP download
4000 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4001 if stream['rtmp_url']:
4002 self.to_screen(u'RTMP download detected')
4003 assert stream['video_url'].startswith('mp4:')
4004 info["url"] = stream["rtmp_url"]
4005 info["play_path"] = stream['video_url']
4007 assert stream["video_url"].endswith('.mp4')
4008 info["url"] = stream["video_url"]
class ZDFIE(InfoExtractor):
    """Information Extractor for ZDF Mediathek (www.zdf.de)."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        # FIX: stream_ must be initialized, otherwise a page with streams but
        # no matching quality raised NameError instead of the error below.
        stream_ = None
        for s in streams:  # find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        for s in streams:  # find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming':  # 'hstreaming' - rtsp is not working
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        # The href above is only an intermediate page; fetch it to get the
        # actual mms:// (or rtsp://) media location.
        media_link = self._download_webpage(stream_['video_url'], video_id, 'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
        if mobj is None:
            raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        # Derive the container extension from the media URL itself.
        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
class TumblrIE(InfoExtractor):
    """Information Extractor for videos embedded in Tumblr posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        # FIX: guard against a failed match instead of crashing with
        # AttributeError on m_url.group below.
        if m_url is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL (the /video/ form redirects).
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded inside escaped javascript (\x22 == ").
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
                                             webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
                                              webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # FIX: renamed local `id` -> `track_id`; it shadowed the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # A direct MP4 link is embedded in a <source> tag on the page.
        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        # The MRSS notice for the id carries both the media URL and the title.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        # Normalize to the canonical page for this video id.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The mobile MP4 URL appears in a javascript player config.
        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # Everything we need is exposed via twitter/og meta tags.
        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        video_uploader_id = match.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret is required by both video XML endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The stream URL is the APP prefix joined with the (escaped) FULLPATH.
        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = match.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id lives in a data attribute of the <article> element.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A separate XML document lists the media files; pick the "high" one.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        # Canonical page for the movie id; the original path suffix is ignored.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        match = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(match.group('server')) == 0:
            # An empty server means 'file' already holds a complete escaped URL.
            video_url = compat_urllib_parse.unquote(match.group('file'))
        else:
            video_url = match.group('server') + '/key=' + match.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        match = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if match:
            video_upload_date = (match.group('upload_date_Y') +
                                 match.group('upload_date_m') +
                                 match.group('upload_date_d'))
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(
            r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = match.group(1)

        # A timestamped request is needed to obtain the session cookie.
        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # The track metadata is embedded as JSON in a <script> tag.
        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE | re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # The serve endpoint requires the cookie collected above.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': 'mp3',
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # The play: page answers with a javascript redirect; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(
            r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id,
                                         u'Downloading redirect page')

        title = self._html_search_regex(
            r'<title>(.*)</title>', webpage, u'title').split('/')[0].strip()

        # POST to the flash info endpoint to get the media and thumbnail urls.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id,
                                               u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response has the shape: key=media_url&key=thumb_url
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1],
                                         info_response.split('&'))

        return [{
            'id': video_id,
            'url': final_url,
            'ext': 'flv',
            'title': title,
            'thumbnail': thumbnail_url,
        }]
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com (MTV-based player)."""
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid differently from normal videos/reviews.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''
        m_info = re.search(info_re, info_page, re.VERBOSE | re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        # FIX: was `raise ExtractError(u'Unable to extrat video url')` —
        # ExtractError is undefined (would raise NameError) and the message
        # had a typo; also simplified the emptiness test.
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url': video_url,
                'id': video_id,
                'title': video_title,
                # Videos are actually flv not mp4
                'ext': 'flv',
                'thumbnail': video_thumb,
                'description': video_description,
                }
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): entries below are a subset of the full registry visible
    # in this chunk; each instance is tried in order via IE.suitable(url),
    # so more specific extractors must precede generic ones.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention and live at
    # module level, so a plain globals() lookup resolves them.
    class_name = '%sIE' % ie_name
    return globals()[class_name]