youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19 import hashlib
  20 import binascii
  21 import urllib
  22
  23 from .utils import *
  24
  25
  26 class InfoExtractor(object):
  27     """Information Extractor class.
  28
  29     Information extractors are the classes that, given a URL, extract
  30     information about the video (or videos) the URL refers to. This
  31     information includes the real video URL, the video title, author and
  32     others. The information is stored in a dictionary which is then
  33     passed to the FileDownloader. The FileDownloader processes this
  34     information possibly downloading the video to the file system, among
  35     other possible outcomes.
  36
  37     The dictionaries must include the following fields:
  38
  39     id:             Video identifier.
  40     url:            Final video URL.
  41     title:          Video title, unescaped.
  42     ext:            Video filename extension.
  43
  44     The following fields are optional:
  45
  46     format:         The video format, defaults to ext (used for --get-format)
  47     thumbnail:      Full URL to a video thumbnail image.
  48     description:    One-line video description.
  49     uploader:       Full name of the video uploader.
  50     upload_date:    Video upload date (YYYYMMDD).
  51     uploader_id:    Nickname or id of the video uploader.
  52     location:       Physical location of the video.
  53     player_url:     SWF Player URL (used for rtmpdump).
  54     subtitles:      The subtitle file contents.
  55     urlhandle:      [internal] The urlHandle to be used to download the file,
  56                     like returned by urllib.request.urlopen
  57
  58     The fields should all be Unicode strings.
  59
  60     Subclasses of this one should re-define the _real_initialize() and
  61     _real_extract() methods and define a _VALID_URL regexp.
  62     Probably, they should also be added to the list of extractors.
  63
  64     _real_extract() must return a *list* of information dictionaries as
  65     described above.
  66
  67     Finally, the _WORKING attribute should be set to False for broken IEs
  68     in order to warn the users and skip the tests.
  69     """
  70
  71     _ready = False
  72     _downloader = None
  73     _WORKING = True
  74
  75     def __init__(self, downloader=None):
  76         """Constructor. Receives an optional downloader."""
  77         self._ready = False
  78         self.set_downloader(downloader)
  79
  80     @classmethod
  81     def suitable(cls, url):
  82         """Receives a URL and returns True if suitable for this IE."""
  83         return re.match(cls._VALID_URL, url) is not None
  84
  85     @classmethod
  86     def working(cls):
  87         """Getter method for _WORKING."""
  88         return cls._WORKING
  89
  90     def initialize(self):
  91         """Initializes an instance (authentication, etc)."""
  92         if not self._ready:
  93             self._real_initialize()
  94             self._ready = True
  95
  96     def extract(self, url):
  97         """Extracts URL information and returns it in list of dicts."""
  98         self.initialize()
  99         return self._real_extract(url)
 100
 101     def set_downloader(self, downloader):
 102         """Sets the downloader for this IE."""
 103         self._downloader = downloader
 104
 105     def _real_initialize(self):
 106         """Real initialization process. Redefine in subclasses."""
 107         pass
 108
 109     def _real_extract(self, url):
 110         """Real extraction process. Redefine in subclasses."""
 111         pass
 112
 113     @property
 114     def IE_NAME(self):
 115         return type(self).__name__[:-2]
 116
 117     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 118         """ Returns the response handle """
 119         if note is None:
 120             self.report_download_webpage(video_id)
 121         elif note is not False:
 122             self.to_screen(u'%s: %s' % (video_id, note))
 123         try:
 124             return compat_urllib_request.urlopen(url_or_request)
 125         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 126             if errnote is None:
 127                 errnote = u'Unable to download webpage'
 128             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 129
 130     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 131         """ Returns a tuple (page content as string, URL handle) """
 132         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 133         content_type = urlh.headers.get('Content-Type', '')
 134         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 135         if m:
 136             encoding = m.group(1)
 137         else:
 138             encoding = 'utf-8'
 139         webpage_bytes = urlh.read()
 140         if self._downloader.params.get('dump_intermediate_pages', False):
 141             try:
 142                 url = url_or_request.get_full_url()
 143             except AttributeError:
 144                 url = url_or_request
 145             self.to_screen(u'Dumping request to ' + url)
 146             dump = base64.b64encode(webpage_bytes).decode('ascii')
 147             self._downloader.to_screen(dump)
 148         content = webpage_bytes.decode(encoding, 'replace')
 149         return (content, urlh)
 150
 151     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 152         """ Returns the data of the page as a string """
 153         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 154
 155     def to_screen(self, msg):
 156         """Print msg to screen, prefixing it with '[ie_name]'"""
 157         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 158
 159     def report_extraction(self, id_or_name):
 160         """Report information extraction."""
 161         self.to_screen(u'%s: Extracting information' % id_or_name)
 162
 163     def report_download_webpage(self, video_id):
 164         """Report webpage download."""
 165         self.to_screen(u'%s: Downloading webpage' % video_id)
 166
 167     def report_age_confirmation(self):
 168         """Report attempt to confirm age."""
 169         self.to_screen(u'Confirming age')
 170
 171     #Methods for following #608
 172     #They set the correct value of the '_type' key
 173     def video_result(self, video_info):
 174         """Returns a video"""
 175         video_info['_type'] = 'video'
 176         return video_info
 177     def url_result(self, url, ie=None):
 178         """Returns a url that points to a page that should be processed"""
 179         #TODO: ie should be the class used for getting the info
 180         video_info = {'_type': 'url',
 181                       'url': url,
 182                       'ie_key': ie}
 183         return video_info
 184     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 185         """Returns a playlist"""
 186         video_info = {'_type': 'playlist',
 187                       'entries': entries}
 188         if playlist_id:
 189             video_info['id'] = playlist_id
 190         if playlist_title:
 191             video_info['title'] = playlist_title
 192         return video_info
 193
 194     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 195         """
 196         Perform a regex search on the given string, using a single or a list of
 197         patterns returning the first matching group.
 198         In case of failure return a default value or raise a WARNING or a
 199         ExtractorError, depending on fatal, specifying the field name.
 200         """
 201         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 202             mobj = re.search(pattern, string, flags)
 203         else:
 204             for p in pattern:
 205                 mobj = re.search(p, string, flags)
 206                 if mobj: break
 207
 208         if sys.stderr.isatty() and os.name != 'nt':
 209             _name = u'\033[0;34m%s\033[0m' % name
 210         else:
 211             _name = name
 212
 213         if mobj:
 214             # return the first matching group
 215             return next(g for g in mobj.groups() if g is not None)
 216         elif default is not None:
 217             return default
 218         elif fatal:
 219             raise ExtractorError(u'Unable to extract %s; '
 220                 u'please report this issue on GitHub.' % _name)
 221         else:
 222             self._downloader.report_warning(u'unable to extract %s; '
 223                 u'please report this issue on GitHub.' % _name)
 224             return None
 225
 226 class SearchInfoExtractor(InfoExtractor):
 227     """
 228     Base class for paged search queries extractors.
 229     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 230     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 231     """
 232
 233     @classmethod
 234     def _make_valid_url(cls):
 235         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 236
 237     @classmethod
 238     def suitable(cls, url):
 239         return re.match(cls._make_valid_url(), url) is not None
 240
 241     def _real_extract(self, query):
 242         mobj = re.match(self._make_valid_url(), query)
 243         if mobj is None:
 244             raise ExtractorError(u'Invalid search query "%s"' % query)
 245
 246         prefix = mobj.group('prefix')
 247         query = mobj.group('query')
 248         if prefix == '':
 249             return self._get_n_results(query, 1)
 250         elif prefix == 'all':
 251             return self._get_n_results(query, self._MAX_RESULTS)
 252         else:
 253             n = int(prefix)
 254             if n <= 0:
 255                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 256             elif n > self._MAX_RESULTS:
 257                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 258                 n = self._MAX_RESULTS
 259             return self._get_n_results(query, n)
 260
 261     def _get_n_results(self, query, n):
 262         """Get a specified number of results for a query"""
 263         raise NotImplementedError("This method must be implemented by sublclasses")
 264
 265
 266 class YoutubeIE(InfoExtractor):
 267     """Information extractor for youtube.com."""
 268
 269     _VALID_URL = r"""^
 270                      (
 271                          (?:https?://)?                                       # http(s):// (optional)
 272                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 273                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 274                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 275                          (?:                                                  # the various things that can precede the ID:
 276                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 277                              |(?:                                             # or the v= param in all its forms
 278                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 279                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 280                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 281                                  v=
 282                              )
 283                          )?                                                   # optional -> youtube.com/xxxx is OK
 284                      )?                                                       # all until now is optional -> you can pass the naked ID
 285                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 286                      (?(1).+)?                                                # if we found the ID, everything can follow
 287                      $"""
 288     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 289     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 290     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 291     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 292     _NETRC_MACHINE = 'youtube'
 293     # Listed in order of quality
 294     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 295     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 296     _video_extensions = {
 297         '13': '3gp',
 298         '17': 'mp4',
 299         '18': 'mp4',
 300         '22': 'mp4',
 301         '37': 'mp4',
 302         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 303         '43': 'webm',
 304         '44': 'webm',
 305         '45': 'webm',
 306         '46': 'webm',
 307     }
 308     _video_dimensions = {
 309         '5': '240x400',
 310         '6': '???',
 311         '13': '???',
 312         '17': '144x176',
 313         '18': '360x640',
 314         '22': '720x1280',
 315         '34': '360x640',
 316         '35': '480x854',
 317         '37': '1080x1920',
 318         '38': '3072x4096',
 319         '43': '360x640',
 320         '44': '480x854',
 321         '45': '720x1280',
 322         '46': '1080x1920',
 323     }
 324     IE_NAME = u'youtube'
 325
 326     @classmethod
 327     def suitable(cls, url):
 328         """Receives a URL and returns True if suitable for this IE."""
 329         if YoutubePlaylistIE.suitable(url): return False
 330         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 331
 332     def report_lang(self):
 333         """Report attempt to set language."""
 334         self.to_screen(u'Setting language')
 335
 336     def report_login(self):
 337         """Report attempt to log in."""
 338         self.to_screen(u'Logging in')
 339
 340     def report_video_webpage_download(self, video_id):
 341         """Report attempt to download video webpage."""
 342         self.to_screen(u'%s: Downloading video webpage' % video_id)
 343
 344     def report_video_info_webpage_download(self, video_id):
 345         """Report attempt to download video info webpage."""
 346         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 347
 348     def report_video_subtitles_download(self, video_id):
 349         """Report attempt to download video info webpage."""
 350         self.to_screen(u'%s: Checking available subtitles' % video_id)
 351
 352     def report_video_subtitles_request(self, video_id, sub_lang, format):
 353         """Report attempt to download video info webpage."""
 354         self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 355
 356     def report_video_subtitles_available(self, video_id, sub_lang_list):
 357         """Report available subtitles."""
 358         sub_lang = ",".join(list(sub_lang_list.keys()))
 359         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
 360
 361     def report_information_extraction(self, video_id):
 362         """Report attempt to extract video information."""
 363         self.to_screen(u'%s: Extracting video information' % video_id)
 364
 365     def report_unavailable_format(self, video_id, format):
 366         """Report extracted video URL."""
 367         self.to_screen(u'%s: Format %s not available' % (video_id, format))
 368
 369     def report_rtmp_download(self):
 370         """Indicate the download will use the RTMP protocol."""
 371         self.to_screen(u'RTMP download detected')
 372
 373     def _get_available_subtitles(self, video_id):
 374         self.report_video_subtitles_download(video_id)
 375         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 376         try:
 377             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 378         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 379             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 380         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 381         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 382         if not sub_lang_list:
 383             return (u'video doesn\'t have subtitles', None)
 384         return sub_lang_list
 385
 386     def _list_available_subtitles(self, video_id):
 387         sub_lang_list = self._get_available_subtitles(video_id)
 388         self.report_video_subtitles_available(video_id, sub_lang_list)
 389
 390     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 391         """
 392         Return tuple:
 393         (error_message, sub_lang, sub)
 394         """
 395         self.report_video_subtitles_request(video_id, sub_lang, format)
 396         params = compat_urllib_parse.urlencode({
 397             'lang': sub_lang,
 398             'name': sub_name,
 399             'v': video_id,
 400             'fmt': format,
 401         })
 402         url = 'http://www.youtube.com/api/timedtext?' + params
 403         try:
 404             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 405         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 406             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 407         if not sub:
 408             return (u'Did not fetch video subtitles', None, None)
 409         return (None, sub_lang, sub)
 410
 411     def _request_automatic_caption(self, video_id, webpage):
 412         """We need the webpage for getting the captions url, pass it as an
 413            argument to speed up the process."""
 414         sub_lang = self._downloader.params.get('subtitleslang')
 415         sub_format = self._downloader.params.get('subtitlesformat')
 416         self.to_screen(u'%s: Looking for automatic captions' % video_id)
 417         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 418         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
 419         if mobj is None:
 420             return [(err_msg, None, None)]
 421         player_config = json.loads(mobj.group(1))
 422         try:
 423             args = player_config[u'args']
 424             caption_url = args[u'ttsurl']
 425             timestamp = args[u'timestamp']
 426             params = compat_urllib_parse.urlencode({
 427                 'lang': 'en',
 428                 'tlang': sub_lang,
 429                 'fmt': sub_format,
 430                 'ts': timestamp,
 431                 'kind': 'asr',
 432             })
 433             subtitles_url = caption_url + '&' + params
 434             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
 435             return [(None, sub_lang, sub)]
 436         except KeyError:
 437             return [(err_msg, None, None)]
 438
 439     def _extract_subtitle(self, video_id):
 440         """
 441         Return a list with a tuple:
 442         [(error_message, sub_lang, sub)]
 443         """
 444         sub_lang_list = self._get_available_subtitles(video_id)
 445         sub_format = self._downloader.params.get('subtitlesformat')
 446         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 447             return [(sub_lang_list[0], None, None)]
 448         if self._downloader.params.get('subtitleslang', False):
 449             sub_lang = self._downloader.params.get('subtitleslang')
 450         elif 'en' in sub_lang_list:
 451             sub_lang = 'en'
 452         else:
 453             sub_lang = list(sub_lang_list.keys())[0]
 454         if not sub_lang in sub_lang_list:
 455             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 456
 457         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 458         return [subtitle]
 459
 460     def _extract_all_subtitles(self, video_id):
 461         sub_lang_list = self._get_available_subtitles(video_id)
 462         sub_format = self._downloader.params.get('subtitlesformat')
 463         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 464             return [(sub_lang_list[0], None, None)]
 465         subtitles = []
 466         for sub_lang in sub_lang_list:
 467             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 468             subtitles.append(subtitle)
 469         return subtitles
 470
 471     def _print_formats(self, formats):
 472         print('Available formats:')
 473         for x in formats:
 474             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 475
 476     def _real_initialize(self):
 477         if self._downloader is None:
 478             return
 479
 480         username = None
 481         password = None
 482         downloader_params = self._downloader.params
 483
 484         # Attempt to use provided username and password or .netrc data
 485         if downloader_params.get('username', None) is not None:
 486             username = downloader_params['username']
 487             password = downloader_params['password']
 488         elif downloader_params.get('usenetrc', False):
 489             try:
 490                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 491                 if info is not None:
 492                     username = info[0]
 493                     password = info[2]
 494                 else:
 495                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 496             except (IOError, netrc.NetrcParseError) as err:
 497                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 498                 return
 499
 500         # Set language
 501         request = compat_urllib_request.Request(self._LANG_URL)
 502         try:
 503             self.report_lang()
 504             compat_urllib_request.urlopen(request).read()
 505         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 506             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 507             return
 508
 509         # No authentication to be performed
 510         if username is None:
 511             return
 512
 513         request = compat_urllib_request.Request(self._LOGIN_URL)
 514         try:
 515             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 516         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 517             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 518             return
 519
 520         galx = None
 521         dsh = None
 522         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 523         if match:
 524           galx = match.group(1)
 525
 526         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 527         if match:
 528           dsh = match.group(1)
 529
 530         # Log in
 531         login_form_strs = {
 532                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 533                 u'Email': username,
 534                 u'GALX': galx,
 535                 u'Passwd': password,
 536                 u'PersistentCookie': u'yes',
 537                 u'_utf8': u'霱',
 538                 u'bgresponse': u'js_disabled',
 539                 u'checkConnection': u'',
 540                 u'checkedDomains': u'youtube',
 541                 u'dnConn': u'',
 542                 u'dsh': dsh,
 543                 u'pstMsg': u'0',
 544                 u'rmShown': u'1',
 545                 u'secTok': u'',
 546                 u'signIn': u'Sign in',
 547                 u'timeStmp': u'',
 548                 u'service': u'youtube',
 549                 u'uilel': u'3',
 550                 u'hl': u'en_US',
 551         }
 552         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 553         # chokes on unicode
 554         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 555         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 556         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 557         try:
 558             self.report_login()
 559             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 560             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 561                 self._downloader.report_warning(u'unable to log in: bad username or password')
 562                 return
 563         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 564             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 565             return
 566
 567         # Confirm age
 568         age_form = {
 569                 'next_url':     '/',
 570                 'action_confirm':   'Confirm',
 571                 }
 572         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 573         try:
 574             self.report_age_confirmation()
 575             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 576         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 577             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 578
 579     def _extract_id(self, url):
 580         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 581         if mobj is None:
 582             raise ExtractorError(u'Invalid URL: %s' % url)
 583         video_id = mobj.group(2)
 584         return video_id
 585
 586     def _real_extract(self, url):
 587         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 588         mobj = re.search(self._NEXT_URL_RE, url)
 589         if mobj:
 590             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 591         video_id = self._extract_id(url)
 592
 593         # Get video webpage
 594         self.report_video_webpage_download(video_id)
 595         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 596         request = compat_urllib_request.Request(url)
 597         try:
 598             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 599         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 600             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
 601
 602         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 603
 604         # Attempt to extract SWF player URL
 605         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 606         if mobj is not None:
 607             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 608         else:
 609             player_url = None
 610
 611         # Get video info
 612         self.report_video_info_webpage_download(video_id)
 613         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 614             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 615                     % (video_id, el_type))
 616             video_info_webpage = self._download_webpage(video_info_url, video_id,
 617                                     note=False,
 618                                     errnote='unable to download video info webpage')
 619             video_info = compat_parse_qs(video_info_webpage)
 620             if 'token' in video_info:
 621                 break
 622         if 'token' not in video_info:
 623             if 'reason' in video_info:
 624                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
 625             else:
 626                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
 627
 628         # Check for "rental" videos
 629         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 630             raise ExtractorError(u'"rental" videos not supported')
 631
 632         # Start extracting information
 633         self.report_information_extraction(video_id)
 634
 635         # uploader
 636         if 'author' not in video_info:
 637             raise ExtractorError(u'Unable to extract uploader name')
 638         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 639
 640         # uploader_id
 641         video_uploader_id = None
 642         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 643         if mobj is not None:
 644             video_uploader_id = mobj.group(1)
 645         else:
 646             self._downloader.report_warning(u'unable to extract uploader nickname')
 647
 648         # title
 649         if 'title' not in video_info:
 650             raise ExtractorError(u'Unable to extract video title')
 651         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 652
 653         # thumbnail image
 654         if 'thumbnail_url' not in video_info:
 655             self._downloader.report_warning(u'unable to extract video thumbnail')
 656             video_thumbnail = ''
 657         else:   # don't panic if we can't find it
 658             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 659
 660         # upload date
 661         upload_date = None
 662         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 663         if mobj is not None:
 664             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 665             upload_date = unified_strdate(upload_date)
 666
 667         # description
 668         video_description = get_element_by_id("eow-description", video_webpage)
 669         if video_description:
 670             video_description = clean_html(video_description)
 671         else:
 672             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 673             if fd_mobj:
 674                 video_description = unescapeHTML(fd_mobj.group(1))
 675             else:
 676                 video_description = u''
 677
 678         # subtitles
 679         video_subtitles = None
 680
 681         if self._downloader.params.get('writesubtitles', False):
 682             video_subtitles = self._extract_subtitle(video_id)
 683             if video_subtitles:
 684                 (sub_error, sub_lang, sub) = video_subtitles[0]
 685                 if sub_error:
 686                     # We try with the automatic captions
 687                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
 688                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
 689                     if sub is not None:
 690                         pass
 691                     else:
 692                         # We report the original error
 693                         self._downloader.report_error(sub_error)
 694
 695         if self._downloader.params.get('allsubtitles', False):
 696             video_subtitles = self._extract_all_subtitles(video_id)
 697             for video_subtitle in video_subtitles:
 698                 (sub_error, sub_lang, sub) = video_subtitle
 699                 if sub_error:
 700                     self._downloader.report_error(sub_error)
 701
 702         if self._downloader.params.get('listsubtitles', False):
 703             sub_lang_list = self._list_available_subtitles(video_id)
 704             return
 705
 706         if 'length_seconds' not in video_info:
 707             self._downloader.report_warning(u'unable to extract video duration')
 708             video_duration = ''
 709         else:
 710             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 711
 712         # token
 713         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 714
 715         # Decide which formats to download
 716         req_format = self._downloader.params.get('format', None)
 717
 718         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 719             self.report_rtmp_download()
 720             video_url_list = [(None, video_info['conn'][0])]
 721         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 722             url_map = {}
 723             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
 724                 url_data = compat_parse_qs(url_data_str)
 725                 if 'itag' in url_data and 'url' in url_data:
 726                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
 727                     if not 'ratebypass' in url: url += '&ratebypass=yes'
 728                     url_map[url_data['itag'][0]] = url
 729
 730             format_limit = self._downloader.params.get('format_limit', None)
 731             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 732             if format_limit is not None and format_limit in available_formats:
 733                 format_list = available_formats[available_formats.index(format_limit):]
 734             else:
 735                 format_list = available_formats
 736             existing_formats = [x for x in format_list if x in url_map]
 737             if len(existing_formats) == 0:
 738                 raise ExtractorError(u'no known formats available for video')
 739             if self._downloader.params.get('listformats', None):
 740                 self._print_formats(existing_formats)
 741                 return
 742             if req_format is None or req_format == 'best':
 743                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 744             elif req_format == 'worst':
 745                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 746             elif req_format in ('-1', 'all'):
 747                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 748             else:
 749                 # Specific formats. We pick the first in a slash-delimeted sequence.
 750                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 751                 req_formats = req_format.split('/')
 752                 video_url_list = None
 753                 for rf in req_formats:
 754                     if rf in url_map:
 755                         video_url_list = [(rf, url_map[rf])]
 756                         break
 757                 if video_url_list is None:
 758                     raise ExtractorError(u'requested format not available')
 759         else:
 760             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
 761
 762         results = []
 763         for format_param, video_real_url in video_url_list:
 764             # Extension
 765             video_extension = self._video_extensions.get(format_param, 'flv')
 766
 767             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 768                                               self._video_dimensions.get(format_param, '???'))
 769
 770             results.append({
 771                 'id':       video_id,
 772                 'url':      video_real_url,
 773                 'uploader': video_uploader,
 774                 'uploader_id': video_uploader_id,
 775                 'upload_date':  upload_date,
 776                 'title':    video_title,
 777                 'ext':      video_extension,
 778                 'format':   video_format,
 779                 'thumbnail':    video_thumbnail,
 780                 'description':  video_description,
 781                 'player_url':   player_url,
 782                 'subtitles':    video_subtitles,
 783                 'duration':     video_duration
 784             })
 785         return results
 786
 787
 788 class MetacafeIE(InfoExtractor):
 789     """Information Extractor for metacafe.com."""
 790
 791     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 792     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 793     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 794     IE_NAME = u'metacafe'
 795
 796     def report_disclaimer(self):
 797         """Report disclaimer retrieval."""
 798         self.to_screen(u'Retrieving disclaimer')
 799
 800     def _real_initialize(self):
 801         # Retrieve disclaimer
 802         request = compat_urllib_request.Request(self._DISCLAIMER)
 803         try:
 804             self.report_disclaimer()
 805             disclaimer = compat_urllib_request.urlopen(request).read()
 806         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 807             raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
 808
 809         # Confirm age
 810         disclaimer_form = {
 811             'filters': '0',
 812             'submit': "Continue - I'm over 18",
 813             }
 814         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 815         try:
 816             self.report_age_confirmation()
 817             disclaimer = compat_urllib_request.urlopen(request).read()
 818         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 819             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 820
 821     def _real_extract(self, url):
 822         # Extract id and simplified title from URL
 823         mobj = re.match(self._VALID_URL, url)
 824         if mobj is None:
 825             raise ExtractorError(u'Invalid URL: %s' % url)
 826
 827         video_id = mobj.group(1)
 828
 829         # Check if video comes from YouTube
 830         mobj2 = re.match(r'^yt-(.*)$', video_id)
 831         if mobj2 is not None:
 832             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
 833
 834         # Retrieve video webpage to extract further information
 835         webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
 836
 837         # Extract URL, uploader and title from webpage
 838         self.report_extraction(video_id)
 839         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 840         if mobj is not None:
 841             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 842             video_extension = mediaURL[-3:]
 843
 844             # Extract gdaKey if available
 845             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 846             if mobj is None:
 847                 video_url = mediaURL
 848             else:
 849                 gdaKey = mobj.group(1)
 850                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 851         else:
 852             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 853             if mobj is None:
 854                 raise ExtractorError(u'Unable to extract media URL')
 855             vardict = compat_parse_qs(mobj.group(1))
 856             if 'mediaData' not in vardict:
 857                 raise ExtractorError(u'Unable to extract media URL')
 858             mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
 859             if mobj is None:
 860                 raise ExtractorError(u'Unable to extract media URL')
 861             mediaURL = mobj.group('mediaURL').replace('\\/', '/')
 862             video_extension = mediaURL[-3:]
 863             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
 864
 865         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 866         if mobj is None:
 867             raise ExtractorError(u'Unable to extract title')
 868         video_title = mobj.group(1).decode('utf-8')
 869
 870         mobj = re.search(r'submitter=(.*?);', webpage)
 871         if mobj is None:
 872             raise ExtractorError(u'Unable to extract uploader nickname')
 873         video_uploader = mobj.group(1)
 874
 875         return [{
 876             'id':       video_id.decode('utf-8'),
 877             'url':      video_url.decode('utf-8'),
 878             'uploader': video_uploader.decode('utf-8'),
 879             'upload_date':  None,
 880             'title':    video_title,
 881             'ext':      video_extension.decode('utf-8'),
 882         }]
 883
 884 class DailymotionIE(InfoExtractor):
 885     """Information Extractor for Dailymotion"""
 886
 887     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 888     IE_NAME = u'dailymotion'
 889
 890     def _real_extract(self, url):
 891         # Extract id and simplified title from URL
 892         mobj = re.match(self._VALID_URL, url)
 893         if mobj is None:
 894             raise ExtractorError(u'Invalid URL: %s' % url)
 895
 896         video_id = mobj.group(1).split('_')[0].split('?')[0]
 897
 898         video_extension = 'mp4'
 899
 900         # Retrieve video webpage to extract further information
 901         request = compat_urllib_request.Request(url)
 902         request.add_header('Cookie', 'family_filter=off')
 903         webpage = self._download_webpage(request, video_id)
 904
 905         # Extract URL, uploader and title from webpage
 906         self.report_extraction(video_id)
 907         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 908         if mobj is None:
 909             raise ExtractorError(u'Unable to extract media URL')
 910         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 911
 912         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 913             if key in flashvars:
 914                 max_quality = key
 915                 self.to_screen(u'Using %s' % key)
 916                 break
 917         else:
 918             raise ExtractorError(u'Unable to extract video URL')
 919
 920         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 921         if mobj is None:
 922             raise ExtractorError(u'Unable to extract video URL')
 923
 924         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 925
 926         # TODO: support choosing qualities
 927
 928         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 929         if mobj is None:
 930             raise ExtractorError(u'Unable to extract title')
 931         video_title = unescapeHTML(mobj.group('title'))
 932
 933         video_uploader = None
 934         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 935         if mobj is None:
 936             # lookin for official user
 937             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 938             if mobj_official is None:
 939                 self._downloader.report_warning(u'unable to extract uploader nickname')
 940             else:
 941                 video_uploader = mobj_official.group(1)
 942         else:
 943             video_uploader = mobj.group(1)
 944
 945         video_upload_date = None
 946         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 947         if mobj is not None:
 948             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 949
 950         return [{
 951             'id':       video_id,
 952             'url':      video_url,
 953             'uploader': video_uploader,
 954             'upload_date':  video_upload_date,
 955             'title':    video_title,
 956             'ext':      video_extension,
 957         }]
 958
 959
 960 class PhotobucketIE(InfoExtractor):
 961     """Information extractor for photobucket.com."""
 962
 963     # TODO: the original _VALID_URL was:
 964     # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 965     # Check if it's necessary to keep the old extracion process
 966     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
 967     IE_NAME = u'photobucket'
 968
 969     def _real_extract(self, url):
 970         # Extract id from URL
 971         mobj = re.match(self._VALID_URL, url)
 972         if mobj is None:
 973             raise ExtractorError(u'Invalid URL: %s' % url)
 974
 975         video_id = mobj.group('id')
 976
 977         video_extension = mobj.group('ext')
 978
 979         # Retrieve video webpage to extract further information
 980         webpage = self._download_webpage(url, video_id)
 981
 982         # Extract URL, uploader, and title from webpage
 983         self.report_extraction(video_id)
 984         # We try first by looking the javascript code:
 985         mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
 986         if mobj is not None:
 987             info = json.loads(mobj.group('json'))
 988             return [{
 989                 'id':       video_id,
 990                 'url':      info[u'downloadUrl'],
 991                 'uploader': info[u'username'],
 992                 'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
 993                 'title':    info[u'title'],
 994                 'ext':      video_extension,
 995                 'thumbnail': info[u'thumbUrl'],
 996             }]
 997
 998         # We try looking in other parts of the webpage
 999         video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1000             webpage, u'video URL')
1001
1002         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1003         if mobj is None:
1004             raise ExtractorError(u'Unable to extract title')
1005         video_title = mobj.group(1).decode('utf-8')
1006         video_uploader = mobj.group(2).decode('utf-8')
1007
1008         return [{
1009             'id':       video_id.decode('utf-8'),
1010             'url':      video_url.decode('utf-8'),
1011             'uploader': video_uploader,
1012             'upload_date':  None,
1013             'title':    video_title,
1014             'ext':      video_extension.decode('utf-8'),
1015         }]
1016
1017
1018 class YahooIE(InfoExtractor):
1019     """Information extractor for screen.yahoo.com."""
1020     _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1021
1022     def _real_extract(self, url):
1023         mobj = re.match(self._VALID_URL, url)
1024         if mobj is None:
1025             raise ExtractorError(u'Invalid URL: %s' % url)
1026         video_id = mobj.group('id')
1027         webpage = self._download_webpage(url, video_id)
1028         m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1029
1030         if m_id is None:
1031             # TODO: Check which url parameters are required
1032             info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1033             webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
1034             info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1035                         <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1036                         <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1037                         <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1038                         '''
1039             self.report_extraction(video_id)
1040             m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1041             if m_info is None:
1042                 raise ExtractorError(u'Unable to extract video info')
1043             video_title = m_info.group('title')
1044             video_description = m_info.group('description')
1045             video_thumb = m_info.group('thumb')
1046             video_date = m_info.group('date')
1047             video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1048
1049             # TODO: Find a way to get mp4 videos
1050             rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1051             webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1052             m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1053             video_url = m_rest.group('url')
1054             video_path = m_rest.group('path')
1055             if m_rest is None:
1056                 raise ExtractorError(u'Unable to extract video url')
1057
1058         else: # We have to use a different method if another id is defined
1059             long_id = m_id.group('new_id')
1060             info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1061             webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
1062             json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1063             info = json.loads(json_str)
1064             res = info[u'query'][u'results'][u'mediaObj'][0]
1065             stream = res[u'streams'][0]
1066             video_path = stream[u'path']
1067             video_url = stream[u'host']
1068             meta = res[u'meta']
1069             video_title = meta[u'title']
1070             video_description = meta[u'description']
1071             video_thumb = meta[u'thumbnail']
1072             video_date = None # I can't find it
1073
1074         info_dict = {
1075                      'id': video_id,
1076                      'url': video_url,
1077                      'play_path': video_path,
1078                      'title':video_title,
1079                      'description': video_description,
1080                      'thumbnail': video_thumb,
1081                      'upload_date': video_date,
1082                      'ext': 'flv',
1083                      }
1084         return info_dict
1085
1086 class VimeoIE(InfoExtractor):
1087     """Information extractor for vimeo.com."""
1088
1089     # _VALID_URL matches Vimeo URLs
1090     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1091     IE_NAME = u'vimeo'
1092
1093     def _real_extract(self, url, new_video=True):
1094         # Extract ID from URL
1095         mobj = re.match(self._VALID_URL, url)
1096         if mobj is None:
1097             raise ExtractorError(u'Invalid URL: %s' % url)
1098
1099         video_id = mobj.group('id')
1100         if not mobj.group('proto'):
1101             url = 'https://' + url
1102         if mobj.group('direct_link') or mobj.group('pro'):
1103             url = 'https://vimeo.com/' + video_id
1104
1105         # Retrieve video webpage to extract further information
1106         request = compat_urllib_request.Request(url, None, std_headers)
1107         webpage = self._download_webpage(request, video_id)
1108
1109         # Now we begin extracting as much information as we can from what we
1110         # retrieved. First we extract the information common to all extractors,
1111         # and latter we extract those that are Vimeo specific.
1112         self.report_extraction(video_id)
1113
1114         # Extract the config JSON
1115         try:
1116             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1117             config = json.loads(config)
1118         except:
1119             if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1120                 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1121             else:
1122                 raise ExtractorError(u'Unable to extract info section')
1123
1124         # Extract title
1125         video_title = config["video"]["title"]
1126
1127         # Extract uploader and uploader_id
1128         video_uploader = config["video"]["owner"]["name"]
1129         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1130
1131         # Extract video thumbnail
1132         video_thumbnail = config["video"]["thumbnail"]
1133
1134         # Extract video description
1135         video_description = get_element_by_attribute("itemprop", "description", webpage)
1136         if video_description: video_description = clean_html(video_description)
1137         else: video_description = u''
1138
1139         # Extract upload date
1140         video_upload_date = None
1141         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1142         if mobj is not None:
1143             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1144
1145         # Vimeo specific: extract request signature and timestamp
1146         sig = config['request']['signature']
1147         timestamp = config['request']['timestamp']
1148
1149         # Vimeo specific: extract video codec and quality information
1150         # First consider quality, then codecs, then take everything
1151         # TODO bind to format param
1152         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1153         files = { 'hd': [], 'sd': [], 'other': []}
1154         for codec_name, codec_extension in codecs:
1155             if codec_name in config["video"]["files"]:
1156                 if 'hd' in config["video"]["files"][codec_name]:
1157                     files['hd'].append((codec_name, codec_extension, 'hd'))
1158                 elif 'sd' in config["video"]["files"][codec_name]:
1159                     files['sd'].append((codec_name, codec_extension, 'sd'))
1160                 else:
1161                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1162
1163         for quality in ('hd', 'sd', 'other'):
1164             if len(files[quality]) > 0:
1165                 video_quality = files[quality][0][2]
1166                 video_codec = files[quality][0][0]
1167                 video_extension = files[quality][0][1]
1168                 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1169                 break
1170         else:
1171             raise ExtractorError(u'No known codec found')
1172
1173         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1174                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1175
1176         return [{
1177             'id':       video_id,
1178             'url':      video_url,
1179             'uploader': video_uploader,
1180             'uploader_id': video_uploader_id,
1181             'upload_date':  video_upload_date,
1182             'title':    video_title,
1183             'ext':      video_extension,
1184             'thumbnail':    video_thumbnail,
1185             'description':  video_description,
1186         }]
1187
1188
1189 class ArteTvIE(InfoExtractor):
1190     """arte.tv information extractor."""
1191
1192     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1193     _LIVE_URL = r'index-[0-9]+\.html$'
1194
1195     IE_NAME = u'arte.tv'
1196
1197     def fetch_webpage(self, url):
1198         request = compat_urllib_request.Request(url)
1199         try:
1200             self.report_download_webpage(url)
1201             webpage = compat_urllib_request.urlopen(request).read()
1202         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1203             raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1204         except ValueError as err:
1205             raise ExtractorError(u'Invalid URL: %s' % url)
1206         return webpage
1207
1208     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1209         page = self.fetch_webpage(url)
1210         mobj = re.search(regex, page, regexFlags)
1211         info = {}
1212
1213         if mobj is None:
1214             raise ExtractorError(u'Invalid URL: %s' % url)
1215
1216         for (i, key, err) in matchTuples:
1217             if mobj.group(i) is None:
1218                 raise ExtractorError(err)
1219             else:
1220                 info[key] = mobj.group(i)
1221
1222         return info
1223
1224     def extractLiveStream(self, url):
1225         video_lang = url.split('/')[-4]
1226         info = self.grep_webpage(
1227             url,
1228             r'src="(.*?/videothek_js.*?\.js)',
1229             0,
1230             [
1231                 (1, 'url', u'Invalid URL: %s' % url)
1232             ]
1233         )
1234         http_host = url.split('/')[2]
1235         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1236         info = self.grep_webpage(
1237             next_url,
1238             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1239                 '(http://.*?\.swf).*?' +
1240                 '(rtmp://.*?)\'',
1241             re.DOTALL,
1242             [
1243                 (1, 'path',   u'could not extract video path: %s' % url),
1244                 (2, 'player', u'could not extract video player: %s' % url),
1245                 (3, 'url',    u'could not extract video url: %s' % url)
1246             ]
1247         )
1248         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1249
1250     def extractPlus7Stream(self, url):
1251         video_lang = url.split('/')[-3]
1252         info = self.grep_webpage(
1253             url,
1254             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1255             0,
1256             [
1257                 (1, 'url', u'Invalid URL: %s' % url)
1258             ]
1259         )
1260         next_url = compat_urllib_parse.unquote(info.get('url'))
1261         info = self.grep_webpage(
1262             next_url,
1263             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1264             0,
1265             [
1266                 (1, 'url', u'Could not find <video> tag: %s' % url)
1267             ]
1268         )
1269         next_url = compat_urllib_parse.unquote(info.get('url'))
1270
1271         info = self.grep_webpage(
1272             next_url,
1273             r'<video id="(.*?)".*?>.*?' +
1274                 '<name>(.*?)</name>.*?' +
1275                 '<dateVideo>(.*?)</dateVideo>.*?' +
1276                 '<url quality="hd">(.*?)</url>',
1277             re.DOTALL,
1278             [
1279                 (1, 'id',    u'could not extract video id: %s' % url),
1280                 (2, 'title', u'could not extract video title: %s' % url),
1281                 (3, 'date',  u'could not extract video date: %s' % url),
1282                 (4, 'url',   u'could not extract video url: %s' % url)
1283             ]
1284         )
1285
1286         return {
1287             'id':           info.get('id'),
1288             'url':          compat_urllib_parse.unquote(info.get('url')),
1289             'uploader':     u'arte.tv',
1290             'upload_date':  unified_strdate(info.get('date')),
1291             'title':        info.get('title').decode('utf-8'),
1292             'ext':          u'mp4',
1293             'format':       u'NA',
1294             'player_url':   None,
1295         }
1296
1297     def _real_extract(self, url):
1298         video_id = url.split('/')[-1]
1299         self.report_extraction(video_id)
1300
1301         if re.search(self._LIVE_URL, video_id) is not None:
1302             self.extractLiveStream(url)
1303             return
1304         else:
1305             info = self.extractPlus7Stream(url)
1306
1307         return [info]
1308
1309
1310 class GenericIE(InfoExtractor):
1311     """Generic last-resort information extractor."""
1312
1313     _VALID_URL = r'.*'
1314     IE_NAME = u'generic'
1315
1316     def report_download_webpage(self, video_id):
1317         """Report webpage download."""
1318         if not self._downloader.params.get('test', False):
1319             self._downloader.report_warning(u'Falling back on generic information extractor.')
1320         super(GenericIE, self).report_download_webpage(video_id)
1321
1322     def report_following_redirect(self, new_url):
1323         """Report information extraction."""
1324         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1325
1326     def _test_redirect(self, url):
1327         """Check if it is a redirect, like url shorteners, in case return the new url."""
1328         class HeadRequest(compat_urllib_request.Request):
1329             def get_method(self):
1330                 return "HEAD"
1331
1332         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1333             """
1334             Subclass the HTTPRedirectHandler to make it use our
1335             HeadRequest also on the redirected URL
1336             """
1337             def redirect_request(self, req, fp, code, msg, headers, newurl):
1338                 if code in (301, 302, 303, 307):
1339                     newurl = newurl.replace(' ', '%20')
1340                     newheaders = dict((k,v) for k,v in req.headers.items()
1341                                       if k.lower() not in ("content-length", "content-type"))
1342                     return HeadRequest(newurl,
1343                                        headers=newheaders,
1344                                        origin_req_host=req.get_origin_req_host(),
1345                                        unverifiable=True)
1346                 else:
1347                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1348
1349         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1350             """
1351             Fallback to GET if HEAD is not allowed (405 HTTP error)
1352             """
1353             def http_error_405(self, req, fp, code, msg, headers):
1354                 fp.read()
1355                 fp.close()
1356
1357                 newheaders = dict((k,v) for k,v in req.headers.items()
1358                                   if k.lower() not in ("content-length", "content-type"))
1359                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1360                                                  headers=newheaders,
1361                                                  origin_req_host=req.get_origin_req_host(),
1362                                                  unverifiable=True))
1363
1364         # Build our opener
1365         opener = compat_urllib_request.OpenerDirector()
1366         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1367                         HTTPMethodFallback, HEADRedirectHandler,
1368                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1369             opener.add_handler(handler())
1370
1371         response = opener.open(HeadRequest(url))
1372         if response is None:
1373             raise ExtractorError(u'Invalid URL protocol')
1374         new_url = response.geturl()
1375
1376         if url == new_url:
1377             return False
1378
1379         self.report_following_redirect(new_url)
1380         return new_url
1381
1382     def _real_extract(self, url):
1383         new_url = self._test_redirect(url)
1384         if new_url: return [self.url_result(new_url)]
1385
1386         video_id = url.split('/')[-1]
1387         try:
1388             webpage = self._download_webpage(url, video_id)
1389         except ValueError as err:
1390             # since this is the last-resort InfoExtractor, if
1391             # this error is thrown, it'll be thrown here
1392             raise ExtractorError(u'Invalid URL: %s' % url)
1393
1394         self.report_extraction(video_id)
1395         # Start with something easy: JW Player in SWFObject
1396         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1397         if mobj is None:
1398             # Broaden the search a little bit
1399             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1400         if mobj is None:
1401             # Broaden the search a little bit: JWPlayer JS loader
1402             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1403         if mobj is None:
1404             raise ExtractorError(u'Invalid URL: %s' % url)
1405
1406         # It's possible that one of the regexes
1407         # matched, but returned an empty group:
1408         if mobj.group(1) is None:
1409             raise ExtractorError(u'Invalid URL: %s' % url)
1410
1411         video_url = compat_urllib_parse.unquote(mobj.group(1))
1412         video_id = os.path.basename(video_url)
1413
1414         # here's a fun little line of code for you:
1415         video_extension = os.path.splitext(video_id)[1][1:]
1416         video_id = os.path.splitext(video_id)[0]
1417
1418         # it's tempting to parse this further, but you would
1419         # have to take into account all the variations like
1420         #   Video Title - Site Name
1421         #   Site Name | Video Title
1422         #   Video Title - Tagline | Site Name
1423         # and so on and so forth; it's just not practical
1424         mobj = re.search(r'<title>(.*)</title>', webpage)
1425         if mobj is None:
1426             raise ExtractorError(u'Unable to extract title')
1427         video_title = mobj.group(1)
1428
1429         # video uploader is domain name
1430         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1431         if mobj is None:
1432             raise ExtractorError(u'Unable to extract title')
1433         video_uploader = mobj.group(1)
1434
1435         return [{
1436             'id':       video_id,
1437             'url':      video_url,
1438             'uploader': video_uploader,
1439             'upload_date':  None,
1440             'title':    video_title,
1441             'ext':      video_extension,
1442         }]
1443
1444
1445 class YoutubeSearchIE(SearchInfoExtractor):
1446     """Information Extractor for YouTube search queries."""
1447     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1448     _MAX_RESULTS = 1000
1449     IE_NAME = u'youtube:search'
1450     _SEARCH_KEY = 'ytsearch'
1451
1452     def report_download_page(self, query, pagenum):
1453         """Report attempt to download search page with given number."""
1454         query = query.decode(preferredencoding())
1455         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1456
1457     def _get_n_results(self, query, n):
1458         """Get a specified number of results for a query"""
1459
1460         video_ids = []
1461         pagenum = 0
1462         limit = n
1463
1464         while (50 * pagenum) < limit:
1465             self.report_download_page(query, pagenum+1)
1466             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1467             request = compat_urllib_request.Request(result_url)
1468             try:
1469                 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1470             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1471                 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1472             api_response = json.loads(data)['data']
1473
1474             if not 'items' in api_response:
1475                 raise ExtractorError(u'[youtube] No video results')
1476
1477             new_ids = list(video['id'] for video in api_response['items'])
1478             video_ids += new_ids
1479
1480             limit = min(n, api_response['totalItems'])
1481             pagenum += 1
1482
1483         if len(video_ids) > n:
1484             video_ids = video_ids[:n]
1485         videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1486         return self.playlist_result(videos, query)
1487
1488
1489 class GoogleSearchIE(SearchInfoExtractor):
1490     """Information Extractor for Google Video search queries."""
1491     _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1492     _MAX_RESULTS = 1000
1493     IE_NAME = u'video.google:search'
1494     _SEARCH_KEY = 'gvsearch'
1495
1496     def _get_n_results(self, query, n):
1497         """Get a specified number of results for a query"""
1498
1499         res = {
1500             '_type': 'playlist',
1501             'id': query,
1502             'entries': []
1503         }
1504
1505         for pagenum in itertools.count(1):
1506             result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1507             webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1508                                              note='Downloading result page ' + str(pagenum))
1509
1510             for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1511                 e = {
1512                     '_type': 'url',
1513                     'url': mobj.group(1)
1514                 }
1515                 res['entries'].append(e)
1516
1517             if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1518                 return res
1519
1520 class YahooSearchIE(SearchInfoExtractor):
1521     """Information Extractor for Yahoo! Video search queries."""
1522
1523     _MAX_RESULTS = 1000
1524     IE_NAME = u'screen.yahoo:search'
1525     _SEARCH_KEY = 'yvsearch'
1526
1527     def _get_n_results(self, query, n):
1528         """Get a specified number of results for a query"""
1529
1530         res = {
1531             '_type': 'playlist',
1532             'id': query,
1533             'entries': []
1534         }
1535         for pagenum in itertools.count(0):
1536             result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1537             webpage = self._download_webpage(result_url, query,
1538                                              note='Downloading results page '+str(pagenum+1))
1539             info = json.loads(webpage)
1540             m = info[u'm']
1541             results = info[u'results']
1542
1543             for (i, r) in enumerate(results):
1544                 if (pagenum * 30) +i >= n:
1545                     break
1546                 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1547                 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1548                 res['entries'].append(e)
1549             if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1550                 break
1551
1552         return res
1553
1554
1555 class YoutubePlaylistIE(InfoExtractor):
1556     """Information Extractor for YouTube playlists."""
1557
1558     _VALID_URL = r"""(?:
1559                         (?:https?://)?
1560                         (?:\w+\.)?
1561                         youtube\.com/
1562                         (?:
1563                            (?:course|view_play_list|my_playlists|artist|playlist|watch)
1564                            \? (?:.*?&)*? (?:p|a|list)=
1565                         |  p/
1566                         )
1567                         ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1568                         .*
1569                      |
1570                         ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1571                      )"""
1572     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1573     _MAX_RESULTS = 50
1574     IE_NAME = u'youtube:playlist'
1575
1576     @classmethod
1577     def suitable(cls, url):
1578         """Receives a URL and returns True if suitable for this IE."""
1579         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1580
1581     def _real_extract(self, url):
1582         # Extract playlist id
1583         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1584         if mobj is None:
1585             raise ExtractorError(u'Invalid URL: %s' % url)
1586
1587         # Download playlist videos from API
1588         playlist_id = mobj.group(1) or mobj.group(2)
1589         page_num = 1
1590         videos = []
1591
1592         while True:
1593             url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1594             page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1595
1596             try:
1597                 response = json.loads(page)
1598             except ValueError as err:
1599                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1600
1601             if 'feed' not in response:
1602                 raise ExtractorError(u'Got a malformed response from YouTube API')
1603             playlist_title = response['feed']['title']['$t']
1604             if 'entry' not in response['feed']:
1605                 # Number of videos is a multiple of self._MAX_RESULTS
1606                 break
1607
1608             videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1609                         for entry in response['feed']['entry']
1610                         if 'content' in entry ]
1611
1612             if len(response['feed']['entry']) < self._MAX_RESULTS:
1613                 break
1614             page_num += 1
1615
1616         videos = [v[1] for v in sorted(videos)]
1617
1618         url_results = [self.url_result(url, 'Youtube') for url in videos]
1619         return [self.playlist_result(url_results, playlist_id, playlist_title)]
1620
1621
1622 class YoutubeChannelIE(InfoExtractor):
1623     """Information Extractor for YouTube channels."""
1624
1625     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1626     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1627     _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1628     _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1629     IE_NAME = u'youtube:channel'
1630
1631     def extract_videos_from_page(self, page):
1632         ids_in_page = []
1633         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1634             if mobj.group(1) not in ids_in_page:
1635                 ids_in_page.append(mobj.group(1))
1636         return ids_in_page
1637
1638     def _real_extract(self, url):
1639         # Extract channel id
1640         mobj = re.match(self._VALID_URL, url)
1641         if mobj is None:
1642             raise ExtractorError(u'Invalid URL: %s' % url)
1643
1644         # Download channel page
1645         channel_id = mobj.group(1)
1646         video_ids = []
1647         pagenum = 1
1648
1649         url = self._TEMPLATE_URL % (channel_id, pagenum)
1650         page = self._download_webpage(url, channel_id,
1651                                       u'Downloading page #%s' % pagenum)
1652
1653         # Extract video identifiers
1654         ids_in_page = self.extract_videos_from_page(page)
1655         video_ids.extend(ids_in_page)
1656
1657         # Download any subsequent channel pages using the json-based channel_ajax query
1658         if self._MORE_PAGES_INDICATOR in page:
1659             while True:
1660                 pagenum = pagenum + 1
1661
1662                 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1663                 page = self._download_webpage(url, channel_id,
1664                                               u'Downloading page #%s' % pagenum)
1665
1666                 page = json.loads(page)
1667
1668                 ids_in_page = self.extract_videos_from_page(page['content_html'])
1669                 video_ids.extend(ids_in_page)
1670
1671                 if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']:
1672                     break
1673
1674         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1675
1676         urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1677         url_entries = [self.url_result(url, 'Youtube') for url in urls]
1678         return [self.playlist_result(url_entries, channel_id)]
1679
1680
1681 class YoutubeUserIE(InfoExtractor):
1682     """Information Extractor for YouTube users."""
1683
1684     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1685     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1686     _GDATA_PAGE_SIZE = 50
1687     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1688     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1689     IE_NAME = u'youtube:user'
1690
1691     def _real_extract(self, url):
1692         # Extract username
1693         mobj = re.match(self._VALID_URL, url)
1694         if mobj is None:
1695             raise ExtractorError(u'Invalid URL: %s' % url)
1696
1697         username = mobj.group(1)
1698
1699         # Download video ids using YouTube Data API. Result size per
1700         # query is limited (currently to 50 videos) so we need to query
1701         # page by page until there are no video ids - it means we got
1702         # all of them.
1703
1704         video_ids = []
1705         pagenum = 0
1706
1707         while True:
1708             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1709
1710             gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1711             page = self._download_webpage(gdata_url, username,
1712                                           u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1713
1714             # Extract video identifiers
1715             ids_in_page = []
1716
1717             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1718                 if mobj.group(1) not in ids_in_page:
1719                     ids_in_page.append(mobj.group(1))
1720
1721             video_ids.extend(ids_in_page)
1722
1723             # A little optimization - if current page is not
1724             # "full", ie. does not contain PAGE_SIZE video ids then
1725             # we can assume that this page is the last one - there
1726             # are no more ids on further pages - no need to query
1727             # again.
1728
1729             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1730                 break
1731
1732             pagenum += 1
1733
1734         urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1735         url_results = [self.url_result(url, 'Youtube') for url in urls]
1736         return [self.playlist_result(url_results, playlist_title = username)]
1737
1738
1739 class BlipTVUserIE(InfoExtractor):
1740     """Information Extractor for blip.tv users."""
1741
1742     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1743     _PAGE_SIZE = 12
1744     IE_NAME = u'blip.tv:user'
1745
1746     def _real_extract(self, url):
1747         # Extract username
1748         mobj = re.match(self._VALID_URL, url)
1749         if mobj is None:
1750             raise ExtractorError(u'Invalid URL: %s' % url)
1751
1752         username = mobj.group(1)
1753
1754         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1755
1756         page = self._download_webpage(url, username, u'Downloading user page')
1757         mobj = re.search(r'data-users-id="([^"]+)"', page)
1758         page_base = page_base % mobj.group(1)
1759
1760
1761         # Download video ids using BlipTV Ajax calls. Result size per
1762         # query is limited (currently to 12 videos) so we need to query
1763         # page by page until there are no video ids - it means we got
1764         # all of them.
1765
1766         video_ids = []
1767         pagenum = 1
1768
1769         while True:
1770             url = page_base + "&page=" + str(pagenum)
1771             page = self._download_webpage(url, username,
1772                                           u'Downloading video ids from page %d' % pagenum)
1773
1774             # Extract video identifiers
1775             ids_in_page = []
1776
1777             for mobj in re.finditer(r'href="/([^"]+)"', page):
1778                 if mobj.group(1) not in ids_in_page:
1779                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1780
1781             video_ids.extend(ids_in_page)
1782
1783             # A little optimization - if current page is not
1784             # "full", ie. does not contain PAGE_SIZE video ids then
1785             # we can assume that this page is the last one - there
1786             # are no more ids on further pages - no need to query
1787             # again.
1788
1789             if len(ids_in_page) < self._PAGE_SIZE:
1790                 break
1791
1792             pagenum += 1
1793
1794         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1795         url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1796         return [self.playlist_result(url_entries, playlist_title = username)]
1797
1798
1799 class DepositFilesIE(InfoExtractor):
1800     """Information extractor for depositfiles.com"""
1801
1802     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1803
1804     def _real_extract(self, url):
1805         file_id = url.split('/')[-1]
1806         # Rebuild url in english locale
1807         url = 'http://depositfiles.com/en/files/' + file_id
1808
1809         # Retrieve file webpage with 'Free download' button pressed
1810         free_download_indication = { 'gateway_result' : '1' }
1811         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1812         try:
1813             self.report_download_webpage(file_id)
1814             webpage = compat_urllib_request.urlopen(request).read()
1815         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1816             raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1817
1818         # Search for the real file URL
1819         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1820         if (mobj is None) or (mobj.group(1) is None):
1821             # Try to figure out reason of the error.
1822             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1823             if (mobj is not None) and (mobj.group(1) is not None):
1824                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1825                 raise ExtractorError(u'%s' % restriction_message)
1826             else:
1827                 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1828
1829         file_url = mobj.group(1)
1830         file_extension = os.path.splitext(file_url)[1][1:]
1831
1832         # Search for file title
1833         file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
1834
1835         return [{
1836             'id':       file_id.decode('utf-8'),
1837             'url':      file_url.decode('utf-8'),
1838             'uploader': None,
1839             'upload_date':  None,
1840             'title':    file_title,
1841             'ext':      file_extension.decode('utf-8'),
1842         }]
1843
1844
1845 class FacebookIE(InfoExtractor):
1846     """Information Extractor for Facebook"""
1847
1848     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1849     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1850     _NETRC_MACHINE = 'facebook'
1851     IE_NAME = u'facebook'
1852
1853     def report_login(self):
1854         """Report attempt to log in."""
1855         self.to_screen(u'Logging in')
1856
1857     def _real_initialize(self):
1858         if self._downloader is None:
1859             return
1860
1861         useremail = None
1862         password = None
1863         downloader_params = self._downloader.params
1864
1865         # Attempt to use provided username and password or .netrc data
1866         if downloader_params.get('username', None) is not None:
1867             useremail = downloader_params['username']
1868             password = downloader_params['password']
1869         elif downloader_params.get('usenetrc', False):
1870             try:
1871                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1872                 if info is not None:
1873                     useremail = info[0]
1874                     password = info[2]
1875                 else:
1876                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1877             except (IOError, netrc.NetrcParseError) as err:
1878                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1879                 return
1880
1881         if useremail is None:
1882             return
1883
1884         # Log in
1885         login_form = {
1886             'email': useremail,
1887             'pass': password,
1888             'login': 'Log+In'
1889             }
1890         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1891         try:
1892             self.report_login()
1893             login_results = compat_urllib_request.urlopen(request).read()
1894             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1895                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1896                 return
1897         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1898             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1899             return
1900
1901     def _real_extract(self, url):
1902         mobj = re.match(self._VALID_URL, url)
1903         if mobj is None:
1904             raise ExtractorError(u'Invalid URL: %s' % url)
1905         video_id = mobj.group('ID')
1906
1907         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1908         webpage = self._download_webpage(url, video_id)
1909
1910         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1911         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1912         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1913         if not m:
1914             raise ExtractorError(u'Cannot parse data')
1915         data = dict(json.loads(m.group(1)))
1916         params_raw = compat_urllib_parse.unquote(data['params'])
1917         params = json.loads(params_raw)
1918         video_data = params['video_data'][0]
1919         video_url = video_data.get('hd_src')
1920         if not video_url:
1921             video_url = video_data['sd_src']
1922         if not video_url:
1923             raise ExtractorError(u'Cannot find video URL')
1924         video_duration = int(video_data['video_duration'])
1925         thumbnail = video_data['thumbnail_src']
1926
1927         video_title = self._search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1928             webpage, u'title')
1929         video_title = unescapeHTML(video_title)
1930
1931         info = {
1932             'id': video_id,
1933             'title': video_title,
1934             'url': video_url,
1935             'ext': 'mp4',
1936             'duration': video_duration,
1937             'thumbnail': thumbnail,
1938         }
1939         return [info]
1940
1941
1942 class BlipTVIE(InfoExtractor):
1943     """Information extractor for blip.tv"""
1944
1945     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
1946     _URL_EXT = r'^.*\.([a-z0-9]+)$'
1947     IE_NAME = u'blip.tv'
1948
1949     def report_direct_download(self, title):
1950         """Report information extraction."""
1951         self.to_screen(u'%s: Direct download detected' % title)
1952
1953     def _real_extract(self, url):
1954         mobj = re.match(self._VALID_URL, url)
1955         if mobj is None:
1956             raise ExtractorError(u'Invalid URL: %s' % url)
1957
1958         # See https://github.com/rg3/youtube-dl/issues/857
1959         api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1960         if api_mobj is not None:
1961             url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1962         urlp = compat_urllib_parse_urlparse(url)
1963         if urlp.path.startswith('/play/'):
1964             request = compat_urllib_request.Request(url)
1965             response = compat_urllib_request.urlopen(request)
1966             redirecturl = response.geturl()
1967             rurlp = compat_urllib_parse_urlparse(redirecturl)
1968             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1969             url = 'http://blip.tv/a/a-' + file_id
1970             return self._real_extract(url)
1971
1972
1973         if '?' in url:
1974             cchar = '&'
1975         else:
1976             cchar = '?'
1977         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1978         request = compat_urllib_request.Request(json_url)
1979         request.add_header('User-Agent', 'iTunes/10.6.1')
1980         self.report_extraction(mobj.group(1))
1981         info = None
1982         try:
1983             urlh = compat_urllib_request.urlopen(request)
1984             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1985                 basename = url.split('/')[-1]
1986                 title,ext = os.path.splitext(basename)
1987                 title = title.decode('UTF-8')
1988                 ext = ext.replace('.', '')
1989                 self.report_direct_download(title)
1990                 info = {
1991                     'id': title,
1992                     'url': url,
1993                     'uploader': None,
1994                     'upload_date': None,
1995                     'title': title,
1996                     'ext': ext,
1997                     'urlhandle': urlh
1998                 }
1999         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2000             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2001         if info is None: # Regular URL
2002             try:
2003                 json_code_bytes = urlh.read()
2004                 json_code = json_code_bytes.decode('utf-8')
2005             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2006                 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2007
2008             try:
2009                 json_data = json.loads(json_code)
2010                 if 'Post' in json_data:
2011                     data = json_data['Post']
2012                 else:
2013                     data = json_data
2014
2015                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2016                 video_url = data['media']['url']
2017                 umobj = re.match(self._URL_EXT, video_url)
2018                 if umobj is None:
2019                     raise ValueError('Can not determine filename extension')
2020                 ext = umobj.group(1)
2021
2022                 info = {
2023                     'id': data['item_id'],
2024                     'url': video_url,
2025                     'uploader': data['display_name'],
2026                     'upload_date': upload_date,
2027                     'title': data['title'],
2028                     'ext': ext,
2029                     'format': data['media']['mimeType'],
2030                     'thumbnail': data['thumbnailUrl'],
2031                     'description': data['description'],
2032                     'player_url': data['embedUrl'],
2033                     'user_agent': 'iTunes/10.6.1',
2034                 }
2035             except (ValueError,KeyError) as err:
2036                 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
2037
2038         return [info]
2039
2040
2041 class MyVideoIE(InfoExtractor):
2042     """Information Extractor for myvideo.de."""
2043
2044     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2045     IE_NAME = u'myvideo'
2046
2047     # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2048     # Released into the Public Domain by Tristan Fischer on 2013-05-19
2049     # https://github.com/rg3/youtube-dl/pull/842
2050     def __rc4crypt(self,data, key):
2051         x = 0
2052         box = list(range(256))
2053         for i in list(range(256)):
2054             x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2055             box[i], box[x] = box[x], box[i]
2056         x = 0
2057         y = 0
2058         out = ''
2059         for char in data:
2060             x = (x + 1) % 256
2061             y = (y + box[x]) % 256
2062             box[x], box[y] = box[y], box[x]
2063             out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
2064         return out
2065
2066     def __md5(self,s):
2067         return hashlib.md5(s).hexdigest().encode()
2068
2069     def _real_extract(self,url):
2070         mobj = re.match(self._VALID_URL, url)
2071         if mobj is None:
2072             raise ExtractorError(u'invalid URL: %s' % url)
2073
2074         video_id = mobj.group(1)
2075
2076         GK = (
2077           b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2078           b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2079           b'TnpsbA0KTVRkbU1tSTRNdz09'
2080         )
2081
2082         # Get video webpage
2083         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2084         webpage = self._download_webpage(webpage_url, video_id)
2085
2086         mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2087         if mobj is not None:
2088             self.report_extraction(video_id)
2089             video_url = mobj.group(1) + '.flv'
2090
2091             video_title = self._search_regex('<title>([^<]+)</title>',
2092                 webpage, u'title')
2093
2094             video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2095
2096             return [{
2097                 'id':       video_id,
2098                 'url':      video_url,
2099                 'uploader': None,
2100                 'upload_date':  None,
2101                 'title':    video_title,
2102                 'ext':      u'flv',
2103             }]
2104
2105         # try encxml
2106         mobj = re.search('var flashvars={(.+?)}', webpage)
2107         if mobj is None:
2108             raise ExtractorError(u'Unable to extract video')
2109
2110         params = {}
2111         encxml = ''
2112         sec = mobj.group(1)
2113         for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2114             if not a == '_encxml':
2115                 params[a] = b
2116             else:
2117                 encxml = compat_urllib_parse.unquote(b)
2118         if not params.get('domain'):
2119             params['domain'] = 'www.myvideo.de'
2120         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2121         if 'flash_playertype=MTV' in xmldata_url:
2122             self._downloader.report_warning(u'avoiding MTV player')
2123             xmldata_url = (
2124                 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2125                 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
2126             ) % video_id
2127
2128         # get enc data
2129         enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2130         enc_data_b = binascii.unhexlify(enc_data)
2131         sk = self.__md5(
2132             base64.b64decode(base64.b64decode(GK)) +
2133             self.__md5(
2134                 str(video_id).encode('utf-8')
2135             )
2136         )
2137         dec_data = self.__rc4crypt(enc_data_b, sk)
2138
2139         # extracting infos
2140         self.report_extraction(video_id)
2141
2142         video_url = None
2143         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2144         if mobj:
2145             video_url = compat_urllib_parse.unquote(mobj.group(1))
2146             if 'myvideo2flash' in video_url:
2147                 self._downloader.report_warning(u'forcing RTMPT ...')
2148                 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2149
2150         if not video_url:
2151             # extract non rtmp videos
2152             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2153             if mobj is None:
2154                 raise ExtractorError(u'unable to extract url')
2155             video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2156
2157         video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2158         video_file = compat_urllib_parse.unquote(video_file)
2159
2160         if not video_file.endswith('f4m'):
2161             ppath, prefix = video_file.split('.')
2162             video_playpath = '%s:%s' % (prefix, ppath)
2163             video_hls_playlist = ''
2164         else:
2165             video_playpath = ''
2166             video_hls_playlist = (
2167                 video_filepath + video_file
2168             ).replace('.f4m', '.m3u8')
2169
2170         video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2171         video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2172
2173         video_title = self._search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2174             webpage, u'title')
2175
2176         return [{
2177             'id':                 video_id,
2178             'url':                video_url,
2179             'tc_url':             video_url,
2180             'uploader':           None,
2181             'upload_date':        None,
2182             'title':              video_title,
2183             'ext':                u'flv',
2184             'play_path':          video_playpath,
2185             'video_file':         video_file,
2186             'video_hls_playlist': video_hls_playlist,
2187             'player_url':         video_swfobj,
2188         }]
2189
2190
2191 class ComedyCentralIE(InfoExtractor):
2192     """Information extractor for The Daily Show and Colbert Report """
2193
2194     # urls can be abbreviations like :thedailyshow or :colbert
2195     # urls for episodes like:
2196     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2197     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2198     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2199     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2200                       |(https?://)?(www\.)?
2201                           (?P<showname>thedailyshow|colbertnation)\.com/
2202                          (full-episodes/(?P<episode>.*)|
2203                           (?P<clip>
2204                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2205                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2206                      $"""
2207
2208     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2209
2210     _video_extensions = {
2211         '3500': 'mp4',
2212         '2200': 'mp4',
2213         '1700': 'mp4',
2214         '1200': 'mp4',
2215         '750': 'mp4',
2216         '400': 'mp4',
2217     }
2218     _video_dimensions = {
2219         '3500': '1280x720',
2220         '2200': '960x540',
2221         '1700': '768x432',
2222         '1200': '640x360',
2223         '750': '512x288',
2224         '400': '384x216',
2225     }
2226
2227     @classmethod
2228     def suitable(cls, url):
2229         """Receives a URL and returns True if suitable for this IE."""
2230         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2231
2232     def _print_formats(self, formats):
2233         print('Available formats:')
2234         for x in formats:
2235             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2236
2237
2238     def _real_extract(self, url):
2239         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2240         if mobj is None:
2241             raise ExtractorError(u'Invalid URL: %s' % url)
2242
2243         if mobj.group('shortname'):
2244             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2245                 url = u'http://www.thedailyshow.com/full-episodes/'
2246             else:
2247                 url = u'http://www.colbertnation.com/full-episodes/'
2248             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2249             assert mobj is not None
2250
2251         if mobj.group('clip'):
2252             if mobj.group('showname') == 'thedailyshow':
2253                 epTitle = mobj.group('tdstitle')
2254             else:
2255                 epTitle = mobj.group('cntitle')
2256             dlNewest = False
2257         else:
2258             dlNewest = not mobj.group('episode')
2259             if dlNewest:
2260                 epTitle = mobj.group('showname')
2261             else:
2262                 epTitle = mobj.group('episode')
2263
2264         self.report_extraction(epTitle)
2265         webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2266         if dlNewest:
2267             url = htmlHandle.geturl()
2268             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2269             if mobj is None:
2270                 raise ExtractorError(u'Invalid redirected URL: ' + url)
2271             if mobj.group('episode') == '':
2272                 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2273             epTitle = mobj.group('episode')
2274
2275         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2276
2277         if len(mMovieParams) == 0:
2278             # The Colbert Report embeds the information in a without
2279             # a URL prefix; so extract the alternate reference
2280             # and then add the URL prefix manually.
2281
2282             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2283             if len(altMovieParams) == 0:
2284                 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2285             else:
2286                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2287
2288         uri = mMovieParams[0][1]
2289         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2290         indexXml = self._download_webpage(indexUrl, epTitle,
2291                                           u'Downloading show index',
2292                                           u'unable to download episode index')
2293
2294         results = []
2295
2296         idoc = xml.etree.ElementTree.fromstring(indexXml)
2297         itemEls = idoc.findall('.//item')
2298         for partNum,itemEl in enumerate(itemEls):
2299             mediaId = itemEl.findall('./guid')[0].text
2300             shortMediaId = mediaId.split(':')[-1]
2301             showId = mediaId.split(':')[-2].replace('.com', '')
2302             officialTitle = itemEl.findall('./title')[0].text
2303             officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2304
2305             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2306                         compat_urllib_parse.urlencode({'uri': mediaId}))
2307             configXml = self._download_webpage(configUrl, epTitle,
2308                                                u'Downloading configuration for %s' % shortMediaId)
2309
2310             cdoc = xml.etree.ElementTree.fromstring(configXml)
2311             turls = []
2312             for rendition in cdoc.findall('.//rendition'):
2313                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2314                 turls.append(finfo)
2315
2316             if len(turls) == 0:
2317                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2318                 continue
2319
2320             if self._downloader.params.get('listformats', None):
2321                 self._print_formats([i[0] for i in turls])
2322                 return
2323
2324             # For now, just pick the highest bitrate
2325             format,rtmp_video_url = turls[-1]
2326
2327             # Get the format arg from the arg stream
2328             req_format = self._downloader.params.get('format', None)
2329
2330             # Select format if we can find one
2331             for f,v in turls:
2332                 if f == req_format:
2333                     format, rtmp_video_url = f, v
2334                     break
2335
2336             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2337             if not m:
2338                 raise ExtractorError(u'Cannot transform RTMP url')
2339             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2340             video_url = base + m.group('finalid')
2341
2342             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2343             info = {
2344                 'id': shortMediaId,
2345                 'url': video_url,
2346                 'uploader': showId,
2347                 'upload_date': officialDate,
2348                 'title': effTitle,
2349                 'ext': 'mp4',
2350                 'format': format,
2351                 'thumbnail': None,
2352                 'description': officialTitle,
2353             }
2354             results.append(info)
2355
2356         return results
2357
2358
2359 class EscapistIE(InfoExtractor):
2360     """Information extractor for The Escapist """
2361
2362     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2363     IE_NAME = u'escapist'
2364
2365     def _real_extract(self, url):
2366         mobj = re.match(self._VALID_URL, url)
2367         if mobj is None:
2368             raise ExtractorError(u'Invalid URL: %s' % url)
2369         showName = mobj.group('showname')
2370         videoId = mobj.group('episode')
2371
2372         self.report_extraction(showName)
2373         webpage = self._download_webpage(url, showName)
2374
2375         videoDesc = self._search_regex('<meta name="description" content="([^"]*)"',
2376             webpage, u'description', fatal=False)
2377         if videoDesc: videoDesc = unescapeHTML(videoDesc)
2378
2379         imgUrl = self._search_regex('<meta property="og:image" content="([^"]*)"',
2380             webpage, u'thumbnail', fatal=False)
2381         if imgUrl: imgUrl = unescapeHTML(imgUrl)
2382
2383         playerUrl = self._search_regex('<meta property="og:video" content="([^"]*)"',
2384             webpage, u'player url')
2385         playerUrl = unescapeHTML(playerUrl)
2386
2387         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2388         configUrl = compat_urllib_parse.unquote(configUrl)
2389
2390         configJSON = self._download_webpage(configUrl, showName,
2391                                             u'Downloading configuration',
2392                                             u'unable to download configuration')
2393
2394         # Technically, it's JavaScript, not JSON
2395         configJSON = configJSON.replace("'", '"')
2396
2397         try:
2398             config = json.loads(configJSON)
2399         except (ValueError,) as err:
2400             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2401
2402         playlist = config['playlist']
2403         videoUrl = playlist[1]['url']
2404
2405         info = {
2406             'id': videoId,
2407             'url': videoUrl,
2408             'uploader': showName,
2409             'upload_date': None,
2410             'title': showName,
2411             'ext': 'mp4',
2412             'thumbnail': imgUrl,
2413             'description': videoDesc,
2414             'player_url': playerUrl,
2415         }
2416
2417         return [info]
2418
2419 class CollegeHumorIE(InfoExtractor):
2420     """Information extractor for collegehumor.com"""
2421
2422     _WORKING = False
2423     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2424     IE_NAME = u'collegehumor'
2425
2426     def report_manifest(self, video_id):
2427         """Report information extraction."""
2428         self.to_screen(u'%s: Downloading XML manifest' % video_id)
2429
2430     def _real_extract(self, url):
2431         mobj = re.match(self._VALID_URL, url)
2432         if mobj is None:
2433             raise ExtractorError(u'Invalid URL: %s' % url)
2434         video_id = mobj.group('videoid')
2435
2436         info = {
2437             'id': video_id,
2438             'uploader': None,
2439             'upload_date': None,
2440         }
2441
2442         self.report_extraction(video_id)
2443         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2444         try:
2445             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2446         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2447             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2448
2449         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2450         try:
2451             videoNode = mdoc.findall('./video')[0]
2452             info['description'] = videoNode.findall('./description')[0].text
2453             info['title'] = videoNode.findall('./caption')[0].text
2454             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2455             manifest_url = videoNode.findall('./file')[0].text
2456         except IndexError:
2457             raise ExtractorError(u'Invalid metadata XML file')
2458
2459         manifest_url += '?hdcore=2.10.3'
2460         self.report_manifest(video_id)
2461         try:
2462             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2463         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2464             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2465
2466         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2467         try:
2468             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2469             node_id = media_node.attrib['url']
2470             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2471         except IndexError as err:
2472             raise ExtractorError(u'Invalid manifest file')
2473
2474         url_pr = compat_urllib_parse_urlparse(manifest_url)
2475         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2476
2477         info['url'] = url
2478         info['ext'] = 'f4f'
2479         return [info]
2480
2481
2482 class XVideosIE(InfoExtractor):
2483     """Information extractor for xvideos.com"""
2484
2485     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2486     IE_NAME = u'xvideos'
2487
2488     def _real_extract(self, url):
2489         mobj = re.match(self._VALID_URL, url)
2490         if mobj is None:
2491             raise ExtractorError(u'Invalid URL: %s' % url)
2492         video_id = mobj.group(1)
2493
2494         webpage = self._download_webpage(url, video_id)
2495
2496         self.report_extraction(video_id)
2497
2498         # Extract video URL
2499         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2500             webpage, u'video URL'))
2501
2502         # Extract title
2503         video_title = self._search_regex(r'<title>(.*?)\s+-\s+XVID',
2504             webpage, u'title')
2505
2506         # Extract video thumbnail
2507         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2508             webpage, u'thumbnail', fatal=False)
2509
2510         info = {
2511             'id': video_id,
2512             'url': video_url,
2513             'uploader': None,
2514             'upload_date': None,
2515             'title': video_title,
2516             'ext': 'flv',
2517             'thumbnail': video_thumbnail,
2518             'description': None,
2519         }
2520
2521         return [info]
2522
2523
2524 class SoundcloudIE(InfoExtractor):
2525     """Information extractor for soundcloud.com
2526        To access the media, the uid of the song and a stream token
2527        must be extracted from the page source and the script must make
2528        a request to media.soundcloud.com/crossdomain.xml. Then
2529        the media can be grabbed by requesting from an url composed
2530        of the stream token and uid
2531      """
2532
2533     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2534     IE_NAME = u'soundcloud'
2535
2536     def report_resolve(self, video_id):
2537         """Report information extraction."""
2538         self.to_screen(u'%s: Resolving id' % video_id)
2539
2540     def _real_extract(self, url):
2541         mobj = re.match(self._VALID_URL, url)
2542         if mobj is None:
2543             raise ExtractorError(u'Invalid URL: %s' % url)
2544
2545         # extract uploader (which is in the url)
2546         uploader = mobj.group(1)
2547         # extract simple title (uploader + slug of song title)
2548         slug_title =  mobj.group(2)
2549         simple_title = uploader + u'-' + slug_title
2550         full_title = '%s/%s' % (uploader, slug_title)
2551
2552         self.report_resolve(full_title)
2553
2554         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2555         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2556         info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2557
2558         info = json.loads(info_json)
2559         video_id = info['id']
2560         self.report_extraction(full_title)
2561
2562         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2563         stream_json = self._download_webpage(streams_url, full_title,
2564                                              u'Downloading stream definitions',
2565                                              u'unable to download stream definitions')
2566
2567         streams = json.loads(stream_json)
2568         mediaURL = streams['http_mp3_128_url']
2569         upload_date = unified_strdate(info['created_at'])
2570
2571         return [{
2572             'id':       info['id'],
2573             'url':      mediaURL,
2574             'uploader': info['user']['username'],
2575             'upload_date': upload_date,
2576             'title':    info['title'],
2577             'ext':      u'mp3',
2578             'description': info['description'],
2579         }]
2580
2581 class SoundcloudSetIE(InfoExtractor):
2582     """Information extractor for soundcloud.com sets
2583        To access the media, the uid of the song and a stream token
2584        must be extracted from the page source and the script must make
2585        a request to media.soundcloud.com/crossdomain.xml. Then
2586        the media can be grabbed by requesting from an url composed
2587        of the stream token and uid
2588      """
2589
2590     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2591     IE_NAME = u'soundcloud:set'
2592
2593     def report_resolve(self, video_id):
2594         """Report information extraction."""
2595         self.to_screen(u'%s: Resolving id' % video_id)
2596
2597     def _real_extract(self, url):
2598         mobj = re.match(self._VALID_URL, url)
2599         if mobj is None:
2600             raise ExtractorError(u'Invalid URL: %s' % url)
2601
2602         # extract uploader (which is in the url)
2603         uploader = mobj.group(1)
2604         # extract simple title (uploader + slug of song title)
2605         slug_title =  mobj.group(2)
2606         simple_title = uploader + u'-' + slug_title
2607         full_title = '%s/sets/%s' % (uploader, slug_title)
2608
2609         self.report_resolve(full_title)
2610
2611         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2612         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2613         info_json = self._download_webpage(resolv_url, full_title)
2614
2615         videos = []
2616         info = json.loads(info_json)
2617         if 'errors' in info:
2618             for err in info['errors']:
2619                 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2620             return
2621
2622         self.report_extraction(full_title)
2623         for track in info['tracks']:
2624             video_id = track['id']
2625
2626             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2627             stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2628
2629             self.report_extraction(video_id)
2630             streams = json.loads(stream_json)
2631             mediaURL = streams['http_mp3_128_url']
2632
2633             videos.append({
2634                 'id':       video_id,
2635                 'url':      mediaURL,
2636                 'uploader': track['user']['username'],
2637                 'upload_date':  unified_strdate(track['created_at']),
2638                 'title':    track['title'],
2639                 'ext':      u'mp3',
2640                 'description': track['description'],
2641             })
2642         return videos
2643
2644
2645 class InfoQIE(InfoExtractor):
2646     """Information extractor for infoq.com"""
2647     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2648
2649     def _real_extract(self, url):
2650         mobj = re.match(self._VALID_URL, url)
2651         if mobj is None:
2652             raise ExtractorError(u'Invalid URL: %s' % url)
2653
2654         webpage = self._download_webpage(url, video_id=url)
2655         self.report_extraction(url)
2656
2657         # Extract video URL
2658         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2659         if mobj is None:
2660             raise ExtractorError(u'Unable to extract video url')
2661         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2662         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2663
2664         # Extract title
2665         video_title = self._search_regex(r'contentTitle = "(.*?)";',
2666             webpage, u'title')
2667
2668         # Extract description
2669         video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2670             webpage, u'description', fatal=False)
2671
2672         video_filename = video_url.split('/')[-1]
2673         video_id, extension = video_filename.split('.')
2674
2675         info = {
2676             'id': video_id,
2677             'url': video_url,
2678             'uploader': None,
2679             'upload_date': None,
2680             'title': video_title,
2681             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2682             'thumbnail': None,
2683             'description': video_description,
2684         }
2685
2686         return [info]
2687
2688 class MixcloudIE(InfoExtractor):
2689     """Information extractor for www.mixcloud.com"""
2690
2691     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2692     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2693     IE_NAME = u'mixcloud'
2694
2695     def report_download_json(self, file_id):
2696         """Report JSON download."""
2697         self.to_screen(u'Downloading json')
2698
2699     def get_urls(self, jsonData, fmt, bitrate='best'):
2700         """Get urls from 'audio_formats' section in json"""
2701         file_url = None
2702         try:
2703             bitrate_list = jsonData[fmt]
2704             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2705                 bitrate = max(bitrate_list) # select highest
2706
2707             url_list = jsonData[fmt][bitrate]
2708         except TypeError: # we have no bitrate info.
2709             url_list = jsonData[fmt]
2710         return url_list
2711
2712     def check_urls(self, url_list):
2713         """Returns 1st active url from list"""
2714         for url in url_list:
2715             try:
2716                 compat_urllib_request.urlopen(url)
2717                 return url
2718             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2719                 url = None
2720
2721         return None
2722
2723     def _print_formats(self, formats):
2724         print('Available formats:')
2725         for fmt in formats.keys():
2726             for b in formats[fmt]:
2727                 try:
2728                     ext = formats[fmt][b][0]
2729                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2730                 except TypeError: # we have no bitrate info
2731                     ext = formats[fmt][0]
2732                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2733                     break
2734
2735     def _real_extract(self, url):
2736         mobj = re.match(self._VALID_URL, url)
2737         if mobj is None:
2738             raise ExtractorError(u'Invalid URL: %s' % url)
2739         # extract uploader & filename from url
2740         uploader = mobj.group(1).decode('utf-8')
2741         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2742
2743         # construct API request
2744         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2745         # retrieve .json file with links to files
2746         request = compat_urllib_request.Request(file_url)
2747         try:
2748             self.report_download_json(file_url)
2749             jsonData = compat_urllib_request.urlopen(request).read()
2750         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2751             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2752
2753         # parse JSON
2754         json_data = json.loads(jsonData)
2755         player_url = json_data['player_swf_url']
2756         formats = dict(json_data['audio_formats'])
2757
2758         req_format = self._downloader.params.get('format', None)
2759         bitrate = None
2760
2761         if self._downloader.params.get('listformats', None):
2762             self._print_formats(formats)
2763             return
2764
2765         if req_format is None or req_format == 'best':
2766             for format_param in formats.keys():
2767                 url_list = self.get_urls(formats, format_param)
2768                 # check urls
2769                 file_url = self.check_urls(url_list)
2770                 if file_url is not None:
2771                     break # got it!
2772         else:
2773             if req_format not in formats:
2774                 raise ExtractorError(u'Format is not available')
2775
2776             url_list = self.get_urls(formats, req_format)
2777             file_url = self.check_urls(url_list)
2778             format_param = req_format
2779
2780         return [{
2781             'id': file_id.decode('utf-8'),
2782             'url': file_url.decode('utf-8'),
2783             'uploader': uploader.decode('utf-8'),
2784             'upload_date': None,
2785             'title': json_data['name'],
2786             'ext': file_url.split('.')[-1].decode('utf-8'),
2787             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2788             'thumbnail': json_data['thumbnail_url'],
2789             'description': json_data['description'],
2790             'player_url': player_url.decode('utf-8'),
2791         }]
2792
2793 class StanfordOpenClassroomIE(InfoExtractor):
2794     """Information extractor for Stanford's Open ClassRoom"""
2795
2796     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2797     IE_NAME = u'stanfordoc'
2798
2799     def _real_extract(self, url):
2800         mobj = re.match(self._VALID_URL, url)
2801         if mobj is None:
2802             raise ExtractorError(u'Invalid URL: %s' % url)
2803
2804         if mobj.group('course') and mobj.group('video'): # A specific video
2805             course = mobj.group('course')
2806             video = mobj.group('video')
2807             info = {
2808                 'id': course + '_' + video,
2809                 'uploader': None,
2810                 'upload_date': None,
2811             }
2812
2813             self.report_extraction(info['id'])
2814             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2815             xmlUrl = baseUrl + video + '.xml'
2816             try:
2817                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2818             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2819                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2820             mdoc = xml.etree.ElementTree.fromstring(metaXml)
2821             try:
2822                 info['title'] = mdoc.findall('./title')[0].text
2823                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2824             except IndexError:
2825                 raise ExtractorError(u'Invalid metadata XML file')
2826             info['ext'] = info['url'].rpartition('.')[2]
2827             return [info]
2828         elif mobj.group('course'): # A course page
2829             course = mobj.group('course')
2830             info = {
2831                 'id': course,
2832                 'type': 'playlist',
2833                 'uploader': None,
2834                 'upload_date': None,
2835             }
2836
2837             coursepage = self._download_webpage(url, info['id'],
2838                                         note='Downloading course info page',
2839                                         errnote='Unable to download course info page')
2840
2841             info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2842             info['title'] = unescapeHTML(info['title'])
2843
2844             info['description'] = self._search_regex('<description>([^<]+)</description>',
2845                 coursepage, u'description', fatal=False)
2846             if info['description']: info['description'] = unescapeHTML(info['description'])
2847
2848             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2849             info['list'] = [
2850                 {
2851                     'type': 'reference',
2852                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2853                 }
2854                     for vpage in links]
2855             results = []
2856             for entry in info['list']:
2857                 assert entry['type'] == 'reference'
2858                 results += self.extract(entry['url'])
2859             return results
2860         else: # Root page
2861             info = {
2862                 'id': 'Stanford OpenClassroom',
2863                 'type': 'playlist',
2864                 'uploader': None,
2865                 'upload_date': None,
2866             }
2867
2868             self.report_download_webpage(info['id'])
2869             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2870             try:
2871                 rootpage = compat_urllib_request.urlopen(rootURL).read()
2872             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2873                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2874
2875             info['title'] = info['id']
2876
2877             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2878             info['list'] = [
2879                 {
2880                     'type': 'reference',
2881                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2882                 }
2883                     for cpage in links]
2884
2885             results = []
2886             for entry in info['list']:
2887                 assert entry['type'] == 'reference'
2888                 results += self.extract(entry['url'])
2889             return results
2890
2891 class MTVIE(InfoExtractor):
2892     """Information extractor for MTV.com"""
2893
2894     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2895     IE_NAME = u'mtv'
2896
2897     def _real_extract(self, url):
2898         mobj = re.match(self._VALID_URL, url)
2899         if mobj is None:
2900             raise ExtractorError(u'Invalid URL: %s' % url)
2901         if not mobj.group('proto'):
2902             url = 'http://' + url
2903         video_id = mobj.group('videoid')
2904
2905         webpage = self._download_webpage(url, video_id)
2906
2907         song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2908             webpage, u'song name', fatal=False)
2909         if song_name: song_name = unescapeHTML(song_name)
2910
2911         video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2912             webpage, u'title')
2913         video_title = unescapeHTML(video_title)
2914
2915         mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2916             webpage, u'mtvn_uri', fatal=False)
2917
2918         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2919             webpage, u'content id', fatal=False)
2920
2921         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2922         self.report_extraction(video_id)
2923         request = compat_urllib_request.Request(videogen_url)
2924         try:
2925             metadataXml = compat_urllib_request.urlopen(request).read()
2926         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2927             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2928
2929         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2930         renditions = mdoc.findall('.//rendition')
2931
2932         # For now, always pick the highest quality.
2933         rendition = renditions[-1]
2934
2935         try:
2936             _,_,ext = rendition.attrib['type'].partition('/')
2937             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2938             video_url = rendition.find('./src').text
2939         except KeyError:
2940             raise ExtractorError('Invalid rendition field.')
2941
2942         info = {
2943             'id': video_id,
2944             'url': video_url,
2945             'uploader': performer,
2946             'upload_date': None,
2947             'title': video_title,
2948             'ext': ext,
2949             'format': format,
2950         }
2951
2952         return [info]
2953
2954
2955 class YoukuIE(InfoExtractor):
2956     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2957
2958     def _gen_sid(self):
2959         nowTime = int(time.time() * 1000)
2960         random1 = random.randint(1000,1998)
2961         random2 = random.randint(1000,9999)
2962
2963         return "%d%d%d" %(nowTime,random1,random2)
2964
2965     def _get_file_ID_mix_string(self, seed):
2966         mixed = []
2967         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2968         seed = float(seed)
2969         for i in range(len(source)):
2970             seed  =  (seed * 211 + 30031 ) % 65536
2971             index  =  math.floor(seed / 65536 * len(source) )
2972             mixed.append(source[int(index)])
2973             source.remove(source[int(index)])
2974         #return ''.join(mixed)
2975         return mixed
2976
2977     def _get_file_id(self, fileId, seed):
2978         mixed = self._get_file_ID_mix_string(seed)
2979         ids = fileId.split('*')
2980         realId = []
2981         for ch in ids:
2982             if ch:
2983                 realId.append(mixed[int(ch)])
2984         return ''.join(realId)
2985
2986     def _real_extract(self, url):
2987         mobj = re.match(self._VALID_URL, url)
2988         if mobj is None:
2989             raise ExtractorError(u'Invalid URL: %s' % url)
2990         video_id = mobj.group('ID')
2991
2992         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2993
2994         jsondata = self._download_webpage(info_url, video_id)
2995
2996         self.report_extraction(video_id)
2997         try:
2998             config = json.loads(jsondata)
2999
3000             video_title =  config['data'][0]['title']
3001             seed = config['data'][0]['seed']
3002
3003             format = self._downloader.params.get('format', None)
3004             supported_format = list(config['data'][0]['streamfileids'].keys())
3005
3006             if format is None or format == 'best':
3007                 if 'hd2' in supported_format:
3008                     format = 'hd2'
3009                 else:
3010                     format = 'flv'
3011                 ext = u'flv'
3012             elif format == 'worst':
3013                 format = 'mp4'
3014                 ext = u'mp4'
3015             else:
3016                 format = 'flv'
3017                 ext = u'flv'
3018
3019
3020             fileid = config['data'][0]['streamfileids'][format]
3021             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3022         except (UnicodeDecodeError, ValueError, KeyError):
3023             raise ExtractorError(u'Unable to extract info section')
3024
3025         files_info=[]
3026         sid = self._gen_sid()
3027         fileid = self._get_file_id(fileid, seed)
3028
3029         #column 8,9 of fileid represent the segment number
3030         #fileid[7:9] should be changed
3031         for index, key in enumerate(keys):
3032
3033             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3034             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3035
3036             info = {
3037                 'id': '%s_part%02d' % (video_id, index),
3038                 'url': download_url,
3039                 'uploader': None,
3040                 'upload_date': None,
3041                 'title': video_title,
3042                 'ext': ext,
3043             }
3044             files_info.append(info)
3045
3046         return files_info
3047
3048
3049 class XNXXIE(InfoExtractor):
3050     """Information extractor for xnxx.com"""
3051
3052     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3053     IE_NAME = u'xnxx'
3054     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3055     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3056     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3057
3058     def _real_extract(self, url):
3059         mobj = re.match(self._VALID_URL, url)
3060         if mobj is None:
3061             raise ExtractorError(u'Invalid URL: %s' % url)
3062         video_id = mobj.group(1)
3063
3064         # Get webpage content
3065         webpage = self._download_webpage(url, video_id)
3066
3067         video_url = self._search_regex(self.VIDEO_URL_RE,
3068             webpage, u'video URL')
3069         video_url = compat_urllib_parse.unquote(video_url)
3070
3071         video_title = self._search_regex(self.VIDEO_TITLE_RE,
3072             webpage, u'title')
3073
3074         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3075             webpage, u'thumbnail', fatal=False)
3076
3077         return [{
3078             'id': video_id,
3079             'url': video_url,
3080             'uploader': None,
3081             'upload_date': None,
3082             'title': video_title,
3083             'ext': 'flv',
3084             'thumbnail': video_thumbnail,
3085             'description': None,
3086         }]
3087
3088
3089 class GooglePlusIE(InfoExtractor):
3090     """Information extractor for plus.google.com."""
3091
3092     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3093     IE_NAME = u'plus.google'
3094
3095     def _real_extract(self, url):
3096         # Extract id from URL
3097         mobj = re.match(self._VALID_URL, url)
3098         if mobj is None:
3099             raise ExtractorError(u'Invalid URL: %s' % url)
3100
3101         post_url = mobj.group(0)
3102         video_id = mobj.group(1)
3103
3104         video_extension = 'flv'
3105
3106         # Step 1, Retrieve post webpage to extract further information
3107         webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3108
3109         self.report_extraction(video_id)
3110
3111         # Extract update date
3112         upload_date = self._search_regex('title="Timestamp">(.*?)</a>',
3113             webpage, u'upload date', fatal=False)
3114         if upload_date:
3115             # Convert timestring to a format suitable for filename
3116             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3117             upload_date = upload_date.strftime('%Y%m%d')
3118
3119         # Extract uploader
3120         uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>',
3121             webpage, u'uploader', fatal=False)
3122
3123         # Extract title
3124         # Get the first line for title
3125         video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3126             webpage, 'title', default=u'NA')
3127
3128         # Step 2, Stimulate clicking the image box to launch video
3129         video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3130             webpage, u'video page URL')
3131         webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3132
3133         # Extract video links on video page
3134         """Extract video links of all sizes"""
3135         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3136         mobj = re.findall(pattern, webpage)
3137         if len(mobj) == 0:
3138             raise ExtractorError(u'Unable to extract video links')
3139
3140         # Sort in resolution
3141         links = sorted(mobj)
3142
3143         # Choose the lowest of the sort, i.e. highest resolution
3144         video_url = links[-1]
3145         # Only get the url. The resolution part in the tuple has no use anymore
3146         video_url = video_url[-1]
3147         # Treat escaped \u0026 style hex
3148         try:
3149             video_url = video_url.decode("unicode_escape")
3150         except AttributeError: # Python 3
3151             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3152
3153
3154         return [{
3155             'id':       video_id,
3156             'url':      video_url,
3157             'uploader': uploader,
3158             'upload_date':  upload_date,
3159             'title':    video_title,
3160             'ext':      video_extension,
3161         }]
3162
3163 class NBAIE(InfoExtractor):
3164     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3165     IE_NAME = u'nba'
3166
3167     def _real_extract(self, url):
3168         mobj = re.match(self._VALID_URL, url)
3169         if mobj is None:
3170             raise ExtractorError(u'Invalid URL: %s' % url)
3171
3172         video_id = mobj.group(1)
3173         if video_id.endswith('/index.html'):
3174             video_id = video_id[:-len('/index.html')]
3175
3176         webpage = self._download_webpage(url, video_id)
3177
3178         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3179
3180         shortened_video_id = video_id.rpartition('/')[2]
3181         title = self._search_regex(r'<meta property="og:title" content="(.*?)"',
3182             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3183
3184         uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3185
3186         description = self._search_regex(r'<div class="description">(.*?)</h1>', webpage, 'description', fatal=False)
3187
3188         info = {
3189             'id': shortened_video_id,
3190             'url': video_url,
3191             'ext': 'mp4',
3192             'title': title,
3193             'uploader_date': uploader_date,
3194             'description': description,
3195         }
3196         return [info]
3197
3198 class JustinTVIE(InfoExtractor):
3199     """Information extractor for justin.tv and twitch.tv"""
3200     # TODO: One broadcast may be split into multiple videos. The key
3201     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3202     # starts at 1 and increases. Can we treat all parts as one video?
3203
3204     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3205         (?:
3206             (?P<channelid>[^/]+)|
3207             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3208             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3209         )
3210         /?(?:\#.*)?$
3211         """
3212     _JUSTIN_PAGE_LIMIT = 100
3213     IE_NAME = u'justin.tv'
3214
3215     def report_download_page(self, channel, offset):
3216         """Report attempt to download a single page of videos."""
3217         self.to_screen(u'%s: Downloading video information from %d to %d' %
3218                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3219
3220     # Return count of items, list of *valid* items
3221     def _parse_page(self, url, video_id):
3222         webpage = self._download_webpage(url, video_id,
3223                                          u'Downloading video info JSON',
3224                                          u'unable to download video info JSON')
3225
3226         response = json.loads(webpage)
3227         if type(response) != list:
3228             error_text = response.get('error', 'unknown error')
3229             raise ExtractorError(u'Justin.tv API: %s' % error_text)
3230         info = []
3231         for clip in response:
3232             video_url = clip['video_file_url']
3233             if video_url:
3234                 video_extension = os.path.splitext(video_url)[1][1:]
3235                 video_date = re.sub('-', '', clip['start_time'][:10])
3236                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3237                 video_id = clip['id']
3238                 video_title = clip.get('title', video_id)
3239                 info.append({
3240                     'id': video_id,
3241                     'url': video_url,
3242                     'title': video_title,
3243                     'uploader': clip.get('channel_name', video_uploader_id),
3244                     'uploader_id': video_uploader_id,
3245                     'upload_date': video_date,
3246                     'ext': video_extension,
3247                 })
3248         return (len(response), info)
3249
3250     def _real_extract(self, url):
3251         mobj = re.match(self._VALID_URL, url)
3252         if mobj is None:
3253             raise ExtractorError(u'invalid URL: %s' % url)
3254
3255         api_base = 'http://api.justin.tv'
3256         paged = False
3257         if mobj.group('channelid'):
3258             paged = True
3259             video_id = mobj.group('channelid')
3260             api = api_base + '/channel/archives/%s.json' % video_id
3261         elif mobj.group('chapterid'):
3262             chapter_id = mobj.group('chapterid')
3263
3264             webpage = self._download_webpage(url, chapter_id)
3265             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3266             if not m:
3267                 raise ExtractorError(u'Cannot find archive of a chapter')
3268             archive_id = m.group(1)
3269
3270             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3271             chapter_info_xml = self._download_webpage(api, chapter_id,
3272                                              note=u'Downloading chapter information',
3273                                              errnote=u'Chapter information download failed')
3274             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3275             for a in doc.findall('.//archive'):
3276                 if archive_id == a.find('./id').text:
3277                     break
3278             else:
3279                 raise ExtractorError(u'Could not find chapter in chapter information')
3280
3281             video_url = a.find('./video_file_url').text
3282             video_ext = video_url.rpartition('.')[2] or u'flv'
3283
3284             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3285             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3286                                    note='Downloading chapter metadata',
3287                                    errnote='Download of chapter metadata failed')
3288             chapter_info = json.loads(chapter_info_json)
3289
3290             bracket_start = int(doc.find('.//bracket_start').text)
3291             bracket_end = int(doc.find('.//bracket_end').text)
3292
3293             # TODO determine start (and probably fix up file)
3294             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3295             #video_url += u'?start=' + TODO:start_timestamp
3296             # bracket_start is 13290, but we want 51670615
3297             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3298                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3299
3300             info = {
3301                 'id': u'c' + chapter_id,
3302                 'url': video_url,
3303                 'ext': video_ext,
3304                 'title': chapter_info['title'],
3305                 'thumbnail': chapter_info['preview'],
3306                 'description': chapter_info['description'],
3307                 'uploader': chapter_info['channel']['display_name'],
3308                 'uploader_id': chapter_info['channel']['name'],
3309             }
3310             return [info]
3311         else:
3312             video_id = mobj.group('videoid')
3313             api = api_base + '/broadcast/by_archive/%s.json' % video_id
3314
3315         self.report_extraction(video_id)
3316
3317         info = []
3318         offset = 0
3319         limit = self._JUSTIN_PAGE_LIMIT
3320         while True:
3321             if paged:
3322                 self.report_download_page(video_id, offset)
3323             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3324             page_count, page_info = self._parse_page(page_url, video_id)
3325             info.extend(page_info)
3326             if not paged or page_count != limit:
3327                 break
3328             offset += limit
3329         return info
3330
3331 class FunnyOrDieIE(InfoExtractor):
3332     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3333
3334     def _real_extract(self, url):
3335         mobj = re.match(self._VALID_URL, url)
3336         if mobj is None:
3337             raise ExtractorError(u'invalid URL: %s' % url)
3338
3339         video_id = mobj.group('id')
3340         webpage = self._download_webpage(url, video_id)
3341
3342         video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3343             webpage, u'video URL', flags=re.DOTALL)
3344         video_url = unescapeHTML(video_url)
3345
3346         title = self._search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3347             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3348         title = clean_html(title)
3349
3350         video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3351             webpage, u'description', flags=re.DOTALL)
3352         if video_description: video_description = unescapeHTML(video_description)
3353
3354         info = {
3355             'id': video_id,
3356             'url': video_url,
3357             'ext': 'mp4',
3358             'title': title,
3359             'description': video_description,
3360         }
3361         return [info]
3362
3363 class SteamIE(InfoExtractor):
3364     _VALID_URL = r"""http://store\.steampowered\.com/
3365                 (agecheck/)?
3366                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3367                 (?P<gameID>\d+)/?
3368                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3369                 """
3370
3371     @classmethod
3372     def suitable(cls, url):
3373         """Receives a URL and returns True if suitable for this IE."""
3374         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3375
3376     def _real_extract(self, url):
3377         m = re.match(self._VALID_URL, url, re.VERBOSE)
3378         gameID = m.group('gameID')
3379         videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3380         self.report_age_confirmation()
3381         webpage = self._download_webpage(videourl, gameID)
3382         game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3383
3384         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3385         mweb = re.finditer(urlRE, webpage)
3386         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3387         titles = re.finditer(namesRE, webpage)
3388         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3389         thumbs = re.finditer(thumbsRE, webpage)
3390         videos = []
3391         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3392             video_id = vid.group('videoID')
3393             title = vtitle.group('videoName')
3394             video_url = vid.group('videoURL')
3395             video_thumb = thumb.group('thumbnail')
3396             if not video_url:
3397                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3398             info = {
3399                 'id':video_id,
3400                 'url':video_url,
3401                 'ext': 'flv',
3402                 'title': unescapeHTML(title),
3403                 'thumbnail': video_thumb
3404                   }
3405             videos.append(info)
3406         return [self.playlist_result(videos, gameID, game_title)]
3407
3408 class UstreamIE(InfoExtractor):
3409     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3410     IE_NAME = u'ustream'
3411
3412     def _real_extract(self, url):
3413         m = re.match(self._VALID_URL, url)
3414         video_id = m.group('videoID')
3415
3416         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3417         webpage = self._download_webpage(url, video_id)
3418
3419         self.report_extraction(video_id)
3420
3421         video_title = self._search_regex(r'data-title="(?P<title>.+)"',
3422             webpage, u'title')
3423
3424         uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3425             webpage, u'uploader', fatal=False, flags=re.DOTALL)
3426         if uploader: uploader = unescapeHTML(uploader.strip())
3427
3428         thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3429             webpage, u'thumbnail', fatal=False)
3430
3431         info = {
3432                 'id': video_id,
3433                 'url': video_url,
3434                 'ext': 'flv',
3435                 'title': video_title,
3436                 'uploader': uploader,
3437                 'thumbnail': thumbnail,
3438                }
3439         return info
3440
3441 class WorldStarHipHopIE(InfoExtractor):
3442     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3443     IE_NAME = u'WorldStarHipHop'
3444
3445     def _real_extract(self, url):
3446         m = re.match(self._VALID_URL, url)
3447         video_id = m.group('id')
3448
3449         webpage_src = self._download_webpage(url, video_id)
3450
3451         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3452             webpage_src, u'video URL')
3453
3454         if 'mp4' in video_url:
3455             ext = 'mp4'
3456         else:
3457             ext = 'flv'
3458
3459         video_title = self._search_regex(r"<title>(.*)</title>",
3460             webpage_src, u'title')
3461
3462         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3463         thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />',
3464             webpage_src, u'thumbnail', fatal=False)
3465
3466         if not thumbnail:
3467             _title = r"""candytitles.*>(.*)</span>"""
3468             mobj = re.search(_title, webpage_src)
3469             if mobj is not None:
3470                 video_title = mobj.group(1)
3471
3472         results = [{
3473                     'id': video_id,
3474                     'url' : video_url,
3475                     'title' : video_title,
3476                     'thumbnail' : thumbnail,
3477                     'ext' : ext,
3478                     }]
3479         return results
3480
3481 class RBMARadioIE(InfoExtractor):
3482     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3483
3484     def _real_extract(self, url):
3485         m = re.match(self._VALID_URL, url)
3486         video_id = m.group('videoID')
3487
3488         webpage = self._download_webpage(url, video_id)
3489
3490         json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
3491             webpage, u'json data')
3492
3493         try:
3494             data = json.loads(json_data)
3495         except ValueError as e:
3496             raise ExtractorError(u'Invalid JSON: ' + str(e))
3497
3498         video_url = data['akamai_url'] + '&cbr=256'
3499         url_parts = compat_urllib_parse_urlparse(video_url)
3500         video_ext = url_parts.path.rpartition('.')[2]
3501         info = {
3502                 'id': video_id,
3503                 'url': video_url,
3504                 'ext': video_ext,
3505                 'title': data['title'],
3506                 'description': data.get('teaser_text'),
3507                 'location': data.get('country_of_origin'),
3508                 'uploader': data.get('host', {}).get('name'),
3509                 'uploader_id': data.get('host', {}).get('slug'),
3510                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3511                 'duration': data.get('duration'),
3512         }
3513         return [info]
3514
3515
3516 class YouPornIE(InfoExtractor):
3517     """Information extractor for youporn.com."""
3518     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3519
3520     def _print_formats(self, formats):
3521         """Print all available formats"""
3522         print(u'Available formats:')
3523         print(u'ext\t\tformat')
3524         print(u'---------------------------------')
3525         for format in formats:
3526             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3527
3528     def _specific(self, req_format, formats):
3529         for x in formats:
3530             if(x["format"]==req_format):
3531                 return x
3532         return None
3533
3534     def _real_extract(self, url):
3535         mobj = re.match(self._VALID_URL, url)
3536         if mobj is None:
3537             raise ExtractorError(u'Invalid URL: %s' % url)
3538         video_id = mobj.group('videoid')
3539
3540         req = compat_urllib_request.Request(url)
3541         req.add_header('Cookie', 'age_verified=1')
3542         webpage = self._download_webpage(req, video_id)
3543
3544         # Get the video title
3545         video_title = self._search_regex(r'<h1.*?>(?P<title>.*)</h1>',
3546             webpage, u'title').strip()
3547
3548         # Get the video date
3549         upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>',
3550             webpage, u'upload date', fatal=False)
3551         if upload_date: upload_date = unified_strdate(upload_date.strip())
3552
3553         # Get the video uploader
3554         video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>',
3555             webpage, u'uploader', fatal=False)
3556         if video_uploader: video_uploader = clean_html(video_uploader.strip())
3557
3558         # Get all of the formats available
3559         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3560         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3561             webpage, u'download list').strip()
3562
3563         # Get all of the links from the page
3564         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3565         links = re.findall(LINK_RE, download_list_html)
3566         if(len(links) == 0):
3567             raise ExtractorError(u'ERROR: no known formats available for video')
3568
3569         self.to_screen(u'Links found: %d' % len(links))
3570
3571         formats = []
3572         for link in links:
3573
3574             # A link looks like this:
3575             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3576             # A path looks like this:
3577             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3578             video_url = unescapeHTML( link )
3579             path = compat_urllib_parse_urlparse( video_url ).path
3580             extension = os.path.splitext( path )[1][1:]
3581             format = path.split('/')[4].split('_')[:2]
3582             size = format[0]
3583             bitrate = format[1]
3584             format = "-".join( format )
3585             title = u'%s-%s-%s' % (video_title, size, bitrate)
3586
3587             formats.append({
3588                 'id': video_id,
3589                 'url': video_url,
3590                 'uploader': video_uploader,
3591                 'upload_date': upload_date,
3592                 'title': title,
3593                 'ext': extension,
3594                 'format': format,
3595                 'thumbnail': None,
3596                 'description': None,
3597                 'player_url': None
3598             })
3599
3600         if self._downloader.params.get('listformats', None):
3601             self._print_formats(formats)
3602             return
3603
3604         req_format = self._downloader.params.get('format', None)
3605         self.to_screen(u'Format: %s' % req_format)
3606
3607         if req_format is None or req_format == 'best':
3608             return [formats[0]]
3609         elif req_format == 'worst':
3610             return [formats[-1]]
3611         elif req_format in ('-1', 'all'):
3612             return formats
3613         else:
3614             format = self._specific( req_format, formats )
3615             if result is None:
3616                 raise ExtractorError(u'Requested format not available')
3617             return [format]
3618
3619
3620
3621 class PornotubeIE(InfoExtractor):
3622     """Information extractor for pornotube.com."""
3623     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3624
3625     def _real_extract(self, url):
3626         mobj = re.match(self._VALID_URL, url)
3627         if mobj is None:
3628             raise ExtractorError(u'Invalid URL: %s' % url)
3629
3630         video_id = mobj.group('videoid')
3631         video_title = mobj.group('title')
3632
3633         # Get webpage content
3634         webpage = self._download_webpage(url, video_id)
3635
3636         # Get the video URL
3637         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3638         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3639         video_url = compat_urllib_parse.unquote(video_url)
3640
3641         #Get the uploaded date
3642         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3643         upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
3644         if upload_date: upload_date = unified_strdate(upload_date)
3645
3646         info = {'id': video_id,
3647                 'url': video_url,
3648                 'uploader': None,
3649                 'upload_date': upload_date,
3650                 'title': video_title,
3651                 'ext': 'flv',
3652                 'format': 'flv'}
3653
3654         return [info]
3655
3656 class YouJizzIE(InfoExtractor):
3657     """Information extractor for youjizz.com."""
3658     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3659
3660     def _real_extract(self, url):
3661         mobj = re.match(self._VALID_URL, url)
3662         if mobj is None:
3663             raise ExtractorError(u'Invalid URL: %s' % url)
3664
3665         video_id = mobj.group('videoid')
3666
3667         # Get webpage content
3668         webpage = self._download_webpage(url, video_id)
3669
3670         # Get the video title
3671         video_title = self._search_regex(r'<title>(?P<title>.*)</title>',
3672             webpage, u'title').strip()
3673
3674         # Get the embed page
3675         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3676         if result is None:
3677             raise ExtractorError(u'ERROR: unable to extract embed page')
3678
3679         embed_page_url = result.group(0).strip()
3680         video_id = result.group('videoid')
3681
3682         webpage = self._download_webpage(embed_page_url, video_id)
3683
3684         # Get the video URL
3685         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3686             webpage, u'video URL')
3687
3688         info = {'id': video_id,
3689                 'url': video_url,
3690                 'title': video_title,
3691                 'ext': 'flv',
3692                 'format': 'flv',
3693                 'player_url': embed_page_url}
3694
3695         return [info]
3696
3697 class EightTracksIE(InfoExtractor):
3698     IE_NAME = '8tracks'
3699     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3700
3701     def _real_extract(self, url):
3702         mobj = re.match(self._VALID_URL, url)
3703         if mobj is None:
3704             raise ExtractorError(u'Invalid URL: %s' % url)
3705         playlist_id = mobj.group('id')
3706
3707         webpage = self._download_webpage(url, playlist_id)
3708
3709         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3710         data = json.loads(json_like)
3711
3712         session = str(random.randint(0, 1000000000))
3713         mix_id = data['id']
3714         track_count = data['tracks_count']
3715         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3716         next_url = first_url
3717         res = []
3718         for i in itertools.count():
3719             api_json = self._download_webpage(next_url, playlist_id,
3720                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3721                 errnote=u'Failed to download song information')
3722             api_data = json.loads(api_json)
3723             track_data = api_data[u'set']['track']
3724             info = {
3725                 'id': track_data['id'],
3726                 'url': track_data['track_file_stream_url'],
3727                 'title': track_data['performer'] + u' - ' + track_data['name'],
3728                 'raw_title': track_data['name'],
3729                 'uploader_id': data['user']['login'],
3730                 'ext': 'm4a',
3731             }
3732             res.append(info)
3733             if api_data['set']['at_last_track']:
3734                 break
3735             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3736         return res
3737
3738 class KeekIE(InfoExtractor):
3739     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3740     IE_NAME = u'keek'
3741
3742     def _real_extract(self, url):
3743         m = re.match(self._VALID_URL, url)
3744         video_id = m.group('videoID')
3745
3746         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3747         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3748         webpage = self._download_webpage(url, video_id)
3749
3750         video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3751             webpage, u'title')
3752         video_title = unescapeHTML(video_title)
3753
3754         uploader = self._search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3755             webpage, u'uploader', fatal=False)
3756         if uploader: uploader = clean_html(uploader)
3757
3758         info = {
3759                 'id': video_id,
3760                 'url': video_url,
3761                 'ext': 'mp4',
3762                 'title': video_title,
3763                 'thumbnail': thumbnail,
3764                 'uploader': uploader
3765         }
3766         return [info]
3767
3768 class TEDIE(InfoExtractor):
3769     _VALID_URL=r'''http://www\.ted\.com/
3770                    (
3771                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3772                         |
3773                         ((?P<type_talk>talks)) # We have a simple talk
3774                    )
3775                    (/lang/(.*?))? # The url may contain the language
3776                    /(?P<name>\w+) # Here goes the name and then ".html"
3777                    '''
3778
3779     @classmethod
3780     def suitable(cls, url):
3781         """Receives a URL and returns True if suitable for this IE."""
3782         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3783
3784     def _real_extract(self, url):
3785         m=re.match(self._VALID_URL, url, re.VERBOSE)
3786         if m.group('type_talk'):
3787             return [self._talk_info(url)]
3788         else :
3789             playlist_id=m.group('playlist_id')
3790             name=m.group('name')
3791             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3792             return [self._playlist_videos_info(url,name,playlist_id)]
3793
3794     def _talk_video_link(self,mediaSlug):
3795         '''Returns the video link for that mediaSlug'''
3796         return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3797
3798     def _playlist_videos_info(self,url,name,playlist_id=0):
3799         '''Returns the videos of the playlist'''
3800         video_RE=r'''
3801                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3802                      ([.\s]*?)data-playlist_item_id="(\d+)"
3803                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3804                      '''
3805         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3806         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3807         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3808         m_names=re.finditer(video_name_RE,webpage)
3809
3810         playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3811         m_playlist = re.search(playlist_RE, webpage)
3812         playlist_title = m_playlist.group('playlist_title')
3813
3814         playlist_entries = []
3815         for m_video, m_name in zip(m_videos,m_names):
3816             video_id=m_video.group('video_id')
3817             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3818             playlist_entries.append(self.url_result(talk_url, 'TED'))
3819         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3820
3821     def _talk_info(self, url, video_id=0):
3822         """Return the video for the talk in the url"""
3823         m=re.match(self._VALID_URL, url,re.VERBOSE)
3824         videoName=m.group('name')
3825         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3826         # If the url includes the language we get the title translated
3827         title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3828         title=re.search(title_RE, webpage).group('title')
3829         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3830                         "id":(?P<videoID>[\d]+).*?
3831                         "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3832         thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3833         thumb_match=re.search(thumb_RE,webpage)
3834         info_match=re.search(info_RE,webpage,re.VERBOSE)
3835         video_id=info_match.group('videoID')
3836         mediaSlug=info_match.group('mediaSlug')
3837         video_url=self._talk_video_link(mediaSlug)
3838         info = {
3839                 'id': video_id,
3840                 'url': video_url,
3841                 'ext': 'mp4',
3842                 'title': title,
3843                 'thumbnail': thumb_match.group('thumbnail')
3844                 }
3845         return info
3846
3847 class MySpassIE(InfoExtractor):
3848     _VALID_URL = r'http://www.myspass.de/.*'
3849
3850     def _real_extract(self, url):
3851         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3852
3853         # video id is the last path element of the URL
3854         # usually there is a trailing slash, so also try the second but last
3855         url_path = compat_urllib_parse_urlparse(url).path
3856         url_parent_path, video_id = os.path.split(url_path)
3857         if not video_id:
3858             _, video_id = os.path.split(url_parent_path)
3859
3860         # get metadata
3861         metadata_url = META_DATA_URL_TEMPLATE % video_id
3862         metadata_text = self._download_webpage(metadata_url, video_id)
3863         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3864
3865         # extract values from metadata
3866         url_flv_el = metadata.find('url_flv')
3867         if url_flv_el is None:
3868             raise ExtractorError(u'Unable to extract download url')
3869         video_url = url_flv_el.text
3870         extension = os.path.splitext(video_url)[1][1:]
3871         title_el = metadata.find('title')
3872         if title_el is None:
3873             raise ExtractorError(u'Unable to extract title')
3874         title = title_el.text
3875         format_id_el = metadata.find('format_id')
3876         if format_id_el is None:
3877             format = ext
3878         else:
3879             format = format_id_el.text
3880         description_el = metadata.find('description')
3881         if description_el is not None:
3882             description = description_el.text
3883         else:
3884             description = None
3885         imagePreview_el = metadata.find('imagePreview')
3886         if imagePreview_el is not None:
3887             thumbnail = imagePreview_el.text
3888         else:
3889             thumbnail = None
3890         info = {
3891             'id': video_id,
3892             'url': video_url,
3893             'title': title,
3894             'ext': extension,
3895             'format': format,
3896             'thumbnail': thumbnail,
3897             'description': description
3898         }
3899         return [info]
3900
3901 class SpiegelIE(InfoExtractor):
3902     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3903
3904     def _real_extract(self, url):
3905         m = re.match(self._VALID_URL, url)
3906         video_id = m.group('videoID')
3907
3908         webpage = self._download_webpage(url, video_id)
3909
3910         video_title = self._search_regex(r'<div class="module-title">(.*?)</div>',
3911             webpage, u'title')
3912         video_title = unescapeHTML(video_title)
3913
3914         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3915         xml_code = self._download_webpage(xml_url, video_id,
3916                     note=u'Downloading XML', errnote=u'Failed to download XML')
3917
3918         idoc = xml.etree.ElementTree.fromstring(xml_code)
3919         last_type = idoc[-1]
3920         filename = last_type.findall('./filename')[0].text
3921         duration = float(last_type.findall('./duration')[0].text)
3922
3923         video_url = 'http://video2.spiegel.de/flash/' + filename
3924         video_ext = filename.rpartition('.')[2]
3925         info = {
3926             'id': video_id,
3927             'url': video_url,
3928             'ext': video_ext,
3929             'title': video_title,
3930             'duration': duration,
3931         }
3932         return [info]
3933
3934 class LiveLeakIE(InfoExtractor):
3935
3936     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3937     IE_NAME = u'liveleak'
3938
3939     def _real_extract(self, url):
3940         mobj = re.match(self._VALID_URL, url)
3941         if mobj is None:
3942             raise ExtractorError(u'Invalid URL: %s' % url)
3943
3944         video_id = mobj.group('video_id')
3945
3946         webpage = self._download_webpage(url, video_id)
3947
3948         video_url = self._search_regex(r'file: "(.*?)",',
3949             webpage, u'video URL')
3950
3951         video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3952             webpage, u'title')
3953         video_title = unescapeHTML(video_title).replace('LiveLeak.com -', '').strip()
3954
3955         video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3956             webpage, u'description', fatal=False)
3957         if video_description: video_description = unescapeHTML(video_description)
3958
3959         video_uploader = self._search_regex(r'By:.*?(\w+)</a>',
3960             webpage, u'uploader', fatal=False)
3961
3962         info = {
3963             'id':  video_id,
3964             'url': video_url,
3965             'ext': 'mp4',
3966             'title': video_title,
3967             'description': video_description,
3968             'uploader': video_uploader
3969         }
3970
3971         return [info]
3972
3973 class ARDIE(InfoExtractor):
3974     _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3975     _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3976     _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3977
3978     def _real_extract(self, url):
3979         # determine video id from url
3980         m = re.match(self._VALID_URL, url)
3981
3982         numid = re.search(r'documentId=([0-9]+)', url)
3983         if numid:
3984             video_id = numid.group(1)
3985         else:
3986             video_id = m.group('video_id')
3987
3988         # determine title and media streams from webpage
3989         html = self._download_webpage(url, video_id)
3990         title = re.search(self._TITLE, html).group('title')
3991         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
3992         if not streams:
3993             assert '"fsk"' in html
3994             raise ExtractorError(u'This video is only available after 8:00 pm')
3995
3996         # choose default media type and highest quality for now
3997         stream = max([s for s in streams if int(s["media_type"]) == 0],
3998                      key=lambda s: int(s["quality"]))
3999
4000         # there's two possibilities: RTMP stream or HTTP download
4001         info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4002         if stream['rtmp_url']:
4003             self.to_screen(u'RTMP download detected')
4004             assert stream['video_url'].startswith('mp4:')
4005             info["url"] = stream["rtmp_url"]
4006             info["play_path"] = stream['video_url']
4007         else:
4008             assert stream["video_url"].endswith('.mp4')
4009             info["url"] = stream["video_url"]
4010         return [info]
4011
4012 class TumblrIE(InfoExtractor):
4013     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4014
4015     def _real_extract(self, url):
4016         m_url = re.match(self._VALID_URL, url)
4017         video_id = m_url.group('id')
4018         blog = m_url.group('blog_name')
4019
4020         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4021         webpage = self._download_webpage(url, video_id)
4022
4023         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4024         video = re.search(re_video, webpage)
4025         if video is None:
4026            raise ExtractorError(u'Unable to extract video')
4027         video_url = video.group('video_url')
4028         ext = video.group('ext')
4029
4030         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
4031             webpage, u'thumbnail', fatal=False)  # We pick the first poster
4032         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
4033
4034         # The only place where you can get a title, it's not complete,
4035         # but searching in other places doesn't work for all videos
4036         video_title = self._search_regex(r'<title>(?P<title>.*?)</title>',
4037             webpage, u'title', flags=re.DOTALL)
4038         video_title = unescapeHTML(video_title)
4039
4040         return [{'id': video_id,
4041                  'url': video_url,
4042                  'title': video_title,
4043                  'thumbnail': video_thumbnail,
4044                  'ext': ext
4045                  }]
4046
4047 class BandcampIE(InfoExtractor):
4048     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4049
4050     def _real_extract(self, url):
4051         mobj = re.match(self._VALID_URL, url)
4052         title = mobj.group('title')
4053         webpage = self._download_webpage(url, title)
4054         # We get the link to the free download page
4055         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4056         if m_download is None:
4057             raise ExtractorError(u'No free songs found')
4058
4059         download_link = m_download.group(1)
4060         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4061                        webpage, re.MULTILINE|re.DOTALL).group('id')
4062
4063         download_webpage = self._download_webpage(download_link, id,
4064                                                   'Downloading free downloads page')
4065         # We get the dictionary of the track from some javascrip code
4066         info = re.search(r'items: (.*?),$',
4067                          download_webpage, re.MULTILINE).group(1)
4068         info = json.loads(info)[0]
4069         # We pick mp3-320 for now, until format selection can be easily implemented.
4070         mp3_info = info[u'downloads'][u'mp3-320']
4071         # If we try to use this url it says the link has expired
4072         initial_url = mp3_info[u'url']
4073         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4074         m_url = re.match(re_url, initial_url)
4075         #We build the url we will use to get the final track url
4076         # This url is build in Bandcamp in the script download_bunde_*.js
4077         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4078         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4079         # If we could correctly generate the .rand field the url would be
4080         #in the "download_url" key
4081         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4082
4083         track_info = {'id':id,
4084                       'title' : info[u'title'],
4085                       'ext' :   'mp3',
4086                       'url' :   final_url,
4087                       'thumbnail' : info[u'thumb_url'],
4088                       'uploader' :  info[u'artist']
4089                       }
4090
4091         return [track_info]
4092
4093 class RedTubeIE(InfoExtractor):
4094     """Information Extractor for redtube"""
4095     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4096
4097     def _real_extract(self,url):
4098         mobj = re.match(self._VALID_URL, url)
4099         if mobj is None:
4100             raise ExtractorError(u'Invalid URL: %s' % url)
4101
4102         video_id = mobj.group('id')
4103         video_extension = 'mp4'
4104         webpage = self._download_webpage(url, video_id)
4105
4106         self.report_extraction(video_id)
4107
4108         video_url = self._search_regex(r'<source src="(.+?)" type="video/mp4">',
4109             webpage, u'video URL')
4110
4111         video_title = self._search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
4112             webpage, u'title')
4113
4114         return [{
4115             'id':       video_id,
4116             'url':      video_url,
4117             'ext':      video_extension,
4118             'title':    video_title,
4119         }]
4120
4121 class InaIE(InfoExtractor):
4122     """Information Extractor for Ina.fr"""
4123     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4124
4125     def _real_extract(self,url):
4126         mobj = re.match(self._VALID_URL, url)
4127
4128         video_id = mobj.group('id')
4129         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4130         video_extension = 'mp4'
4131         webpage = self._download_webpage(mrss_url, video_id)
4132
4133         self.report_extraction(video_id)
4134
4135         video_url = self._search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
4136             webpage, u'video URL')
4137
4138         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
4139             webpage, u'title')
4140
4141         return [{
4142             'id':       video_id,
4143             'url':      video_url,
4144             'ext':      video_extension,
4145             'title':    video_title,
4146         }]
4147
4148 class HowcastIE(InfoExtractor):
4149     """Information Extractor for Howcast.com"""
4150     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4151
4152     def _real_extract(self, url):
4153         mobj = re.match(self._VALID_URL, url)
4154
4155         video_id = mobj.group('id')
4156         webpage_url = 'http://www.howcast.com/videos/' + video_id
4157         webpage = self._download_webpage(webpage_url, video_id)
4158
4159         self.report_extraction(video_id)
4160
4161         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
4162             webpage, u'video URL')
4163
4164         video_title = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
4165             webpage, u'title')
4166
4167         video_description = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
4168             webpage, u'description', fatal=False)
4169
4170         thumbnail = self._search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
4171             webpage, u'thumbnail', fatal=False)
4172
4173         return [{
4174             'id':       video_id,
4175             'url':      video_url,
4176             'ext':      'mp4',
4177             'title':    video_title,
4178             'description': video_description,
4179             'thumbnail': thumbnail,
4180         }]
4181
4182 class VineIE(InfoExtractor):
4183     """Information Extractor for Vine.co"""
4184     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4185
4186     def _real_extract(self, url):
4187         mobj = re.match(self._VALID_URL, url)
4188
4189         video_id = mobj.group('id')
4190         webpage_url = 'https://vine.co/v/' + video_id
4191         webpage = self._download_webpage(webpage_url, video_id)
4192
4193         self.report_extraction(video_id)
4194
4195         video_url = self._search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
4196             webpage, u'video URL')
4197
4198         video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
4199             webpage, u'title')
4200
4201         thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
4202             webpage, u'thumbnail', fatal=False)
4203
4204         uploader = self._search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
4205             webpage, u'uploader', fatal=False, flags=re.DOTALL)
4206
4207         return [{
4208             'id':        video_id,
4209             'url':       video_url,
4210             'ext':       'mp4',
4211             'title':     video_title,
4212             'thumbnail': thumbnail,
4213             'uploader':  uploader,
4214         }]
4215
4216 class FlickrIE(InfoExtractor):
4217     """Information Extractor for Flickr videos"""
4218     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4219
4220     def _real_extract(self, url):
4221         mobj = re.match(self._VALID_URL, url)
4222
4223         video_id = mobj.group('id')
4224         video_uploader_id = mobj.group('uploader_id')
4225         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4226         webpage = self._download_webpage(webpage_url, video_id)
4227
4228         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
4229
4230         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4231         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
4232
4233         node_id = self._search_regex(r'<Item id="id">(\d+-\d+)</Item>',
4234             first_xml, u'node_id')
4235
4236         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4237         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4238
4239         self.report_extraction(video_id)
4240
4241         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
4242         if mobj is None:
4243             raise ExtractorError(u'Unable to extract video url')
4244         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
4245
4246         video_title = self._search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
4247             webpage, u'video title')
4248
4249         video_description = self._search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
4250             webpage, u'description', fatal=False)
4251
4252         thumbnail = self._search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
4253             webpage, u'thumbnail', fatal=False)
4254
4255         return [{
4256             'id':          video_id,
4257             'url':         video_url,
4258             'ext':         'mp4',
4259             'title':       video_title,
4260             'description': video_description,
4261             'thumbnail':   thumbnail,
4262             'uploader_id': video_uploader_id,
4263         }]
4264
4265 class TeamcocoIE(InfoExtractor):
4266     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4267
4268     def _real_extract(self, url):
4269         mobj = re.match(self._VALID_URL, url)
4270         if mobj is None:
4271             raise ExtractorError(u'Invalid URL: %s' % url)
4272         url_title = mobj.group('url_title')
4273         webpage = self._download_webpage(url, url_title)
4274
4275         video_id = self._search_regex(r'<article class="video" data-id="(\d+?)"',
4276             webpage, u'video id')
4277
4278         self.report_extraction(video_id)
4279
4280         video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
4281             webpage, u'title')
4282
4283         thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)"',
4284             webpage, u'thumbnail', fatal=False)
4285
4286         video_description = self._search_regex(r'<meta property="og:description" content="(.*?)"',
4287             webpage, u'description', fatal=False)
4288
4289         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4290         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
4291
4292         video_url = self._search_regex(r'<file type="high".*?>(.*?)</file>',
4293             data, u'video URL')
4294
4295         return [{
4296             'id':          video_id,
4297             'url':         video_url,
4298             'ext':         'mp4',
4299             'title':       video_title,
4300             'thumbnail':   thumbnail,
4301             'description': video_description,
4302         }]
4303
4304 class XHamsterIE(InfoExtractor):
4305     """Information Extractor for xHamster"""
4306     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4307
4308     def _real_extract(self,url):
4309         mobj = re.match(self._VALID_URL, url)
4310
4311         video_id = mobj.group('id')
4312         mrss_url='http://xhamster.com/movies/%s/.html' % video_id
4313         webpage = self._download_webpage(mrss_url, video_id)
4314         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4315         if mobj is None:
4316             raise ExtractorError(u'Unable to extract media URL')
4317         if len(mobj.group('server')) == 0:
4318             video_url = compat_urllib_parse.unquote(mobj.group('file'))
4319         else:
4320             video_url = mobj.group('server')+'/key='+mobj.group('file')
4321         video_extension = video_url.split('.')[-1]
4322
4323         mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
4324         if mobj is None:
4325             raise ExtractorError(u'Unable to extract title')
4326         video_title = unescapeHTML(mobj.group('title'))
4327
4328         mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
4329         if mobj is None:
4330             video_description = u''
4331         else:
4332             video_description = unescapeHTML(mobj.group('description'))
4333
4334         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4335         if mobj is None:
4336             raise ExtractorError(u'Unable to extract upload date')
4337         video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4338
4339         mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
4340         if mobj is None:
4341             video_uploader_id = u'anonymous'
4342         else:
4343             video_uploader_id = mobj.group('uploader_id')
4344
4345         mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
4346         if mobj is None:
4347             raise ExtractorError(u'Unable to extract thumbnail URL')
4348         video_thumbnail = mobj.group('thumbnail')
4349
4350         return [{
4351             'id':       video_id,
4352             'url':      video_url,
4353             'ext':      video_extension,
4354             'title':    video_title,
4355             'description': video_description,
4356             'upload_date': video_upload_date,
4357             'uploader_id': video_uploader_id,
4358             'thumbnail': video_thumbnail
4359         }]
4360
4361 class HypemIE(InfoExtractor):
4362     """Information Extractor for hypem"""
4363     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4364
4365     def _real_extract(self, url):
4366         mobj = re.match(self._VALID_URL, url)
4367         if mobj is None:
4368             raise ExtractorError(u'Invalid URL: %s' % url)
4369         track_id = mobj.group(1)
4370
4371         data = { 'ax': 1, 'ts': time.time() }
4372         data_encoded = compat_urllib_parse.urlencode(data)
4373         complete_url = url + "?" + data_encoded
4374         request = compat_urllib_request.Request(complete_url)
4375         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
4376         cookie = urlh.headers.get('Set-Cookie', '')
4377
4378         self.report_extraction(track_id)
4379         mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
4380         if mobj is None:
4381             raise ExtractorError(u'Unable to extrack tracks')
4382         html_tracks = mobj.group(1).strip()
4383         try:
4384             track_list = json.loads(html_tracks)
4385             track = track_list[u'tracks'][0]
4386         except ValueError:
4387             raise ExtractorError(u'Hypemachine contained invalid JSON.')
4388
4389         key = track[u"key"]
4390         track_id = track[u"id"]
4391         artist = track[u"artist"]
4392         title = track[u"song"]
4393
4394         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4395         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4396         request.add_header('cookie', cookie)
4397         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4398         try:
4399             song_data = json.loads(song_data_json)
4400         except ValueError:
4401             raise ExtractorError(u'Hypemachine contained invalid JSON.')
4402         final_url = song_data[u"url"]
4403
4404         return [{
4405             'id':       track_id,
4406             'url':      final_url,
4407             'ext':      "mp3",
4408             'title':    title,
4409             'artist':   artist,
4410         }]
4411
4412
4413 def gen_extractors():
4414     """ Return a list of an instance of every supported extractor.
4415     The order does matter; the first extractor matched is the one handling the URL.
4416     """
4417     return [
4418         YoutubePlaylistIE(),
4419         YoutubeChannelIE(),
4420         YoutubeUserIE(),
4421         YoutubeSearchIE(),
4422         YoutubeIE(),
4423         MetacafeIE(),
4424         DailymotionIE(),
4425         GoogleSearchIE(),
4426         PhotobucketIE(),
4427         YahooIE(),
4428         YahooSearchIE(),
4429         DepositFilesIE(),
4430         FacebookIE(),
4431         BlipTVIE(),
4432         BlipTVUserIE(),
4433         VimeoIE(),
4434         MyVideoIE(),
4435         ComedyCentralIE(),
4436         EscapistIE(),
4437         CollegeHumorIE(),
4438         XVideosIE(),
4439         SoundcloudSetIE(),
4440         SoundcloudIE(),
4441         InfoQIE(),
4442         MixcloudIE(),
4443         StanfordOpenClassroomIE(),
4444         MTVIE(),
4445         YoukuIE(),
4446         XNXXIE(),
4447         YouJizzIE(),
4448         PornotubeIE(),
4449         YouPornIE(),
4450         GooglePlusIE(),
4451         ArteTvIE(),
4452         NBAIE(),
4453         WorldStarHipHopIE(),
4454         JustinTVIE(),
4455         FunnyOrDieIE(),
4456         SteamIE(),
4457         UstreamIE(),
4458         RBMARadioIE(),
4459         EightTracksIE(),
4460         KeekIE(),
4461         TEDIE(),
4462         MySpassIE(),
4463         SpiegelIE(),
4464         LiveLeakIE(),
4465         ARDIE(),
4466         TumblrIE(),
4467         BandcampIE(),
4468         RedTubeIE(),
4469         InaIE(),
4470         HowcastIE(),
4471         VineIE(),
4472         FlickrIE(),
4473         TeamcocoIE(),
4474         XHamsterIE(),
4475         HypemIE(),
4476         GenericIE()
4477     ]
4478
4479 def get_info_extractor(ie_name):
4480     """Returns the info extractor class with the given ie_name"""
4481     return globals()[ie_name+'IE']