# Source: youtube-dl.git / youtube_dl / InfoExtractors.py
# (gitweb blob at commit 86cc7c7484334baddc076091189dee1053661b11, CieloNegro.org mirror)
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
import base64
import binascii
import datetime
import email.utils
import hashlib
import itertools
import json
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import urllib
import xml.etree.ElementTree

from .utils import *
24
25
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False      # set to True once _real_initialize() has run
    _downloader = None  # FileDownloader instance, set via set_downloader()
    _WORKING = True     # set to False in subclasses that are known broken

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc).

        Idempotent: _real_initialize() runs at most once per instance.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived from the class name minus the trailing "IE"
        # (e.g. FooIE -> "Foo"). Subclasses may shadow this with a
        # plain class attribute.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle.

        note=None prints the default download message, note=False prints
        nothing, any other value is shown to the user. Network errors are
        re-raised as ExtractorError (with errnote as the message prefix).
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour the charset declared in the Content-Type header;
        # fall back to UTF-8 when none is declared.
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request was a plain URL string
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        # FIX: mobj was previously unbound when `pattern` was an empty
        # sequence, raising UnboundLocalError below instead of the proper
        # "unable to extract" handling.
        mobj = None
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Highlight the field name in blue on capable terminals.
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on GitHub.' % _name)
            return None
224
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # The prefix is either empty (one result), a positive decimal
        # count, or the literal "all".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search "URL" and delegate to _get_n_results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            n = 1
        elif prefix == 'all':
            n = self._MAX_RESULTS
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            if n > self._MAX_RESULTS:
                # Clamp to the extractor's maximum and warn the user.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by sublclasses")
263
264
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex (compiled with re.VERBOSE): matches watch/embed/short
    # URLs as well as bare 11-character video IDs; the comments inside the
    # pattern describe each alternative.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Forces the site into English so scraping regexes stay stable.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the original target of an age-verification redirect.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # itag codes, listed in order of quality (best first)
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same, but preferring free (WebM) containers at comparable quality.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/extension; anything missing defaults to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> video dimensions ('???' when unknown).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Shadows the base-class IE_NAME property with a fixed name.
    IE_NAME = u'youtube'
324
325     @classmethod
326     def suitable(cls, url):
327         """Receives a URL and returns True if suitable for this IE."""
328         if YoutubePlaylistIE.suitable(url): return False
329         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
330
    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a single subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
371
372     def _get_available_subtitles(self, video_id):
373         self.report_video_subtitles_download(video_id)
374         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
375         try:
376             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
377         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
378             return (u'unable to download video subtitles: %s' % compat_str(err), None)
379         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
380         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
381         if not sub_lang_list:
382             return (u'video doesn\'t have subtitles', None)
383         return sub_lang_list
384
385     def _list_available_subtitles(self, video_id):
386         sub_lang_list = self._get_available_subtitles(video_id)
387         self.report_video_subtitles_available(video_id, sub_lang_list)
388
389     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
390         """
391         Return tuple:
392         (error_message, sub_lang, sub)
393         """
394         self.report_video_subtitles_request(video_id, sub_lang, format)
395         params = compat_urllib_parse.urlencode({
396             'lang': sub_lang,
397             'name': sub_name,
398             'v': video_id,
399             'fmt': format,
400         })
401         url = 'http://www.youtube.com/api/timedtext?' + params
402         try:
403             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
404         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
405             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
406         if not sub:
407             return (u'Did not fetch video subtitles', None, None)
408         return (None, sub_lang, sub)
409
410     def _request_automatic_caption(self, video_id, webpage):
411         """We need the webpage for getting the captions url, pass it as an
412            argument to speed up the process."""
413         sub_lang = self._downloader.params.get('subtitleslang')
414         sub_format = self._downloader.params.get('subtitlesformat')
415         self.to_screen(u'%s: Looking for automatic captions' % video_id)
416         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
417         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
418         if mobj is None:
419             return [(err_msg, None, None)]
420         player_config = json.loads(mobj.group(1))
421         try:
422             args = player_config[u'args']
423             caption_url = args[u'ttsurl']
424             timestamp = args[u'timestamp']
425             params = compat_urllib_parse.urlencode({
426                 'lang': 'en',
427                 'tlang': sub_lang,
428                 'fmt': sub_format,
429                 'ts': timestamp,
430                 'kind': 'asr',
431             })
432             subtitles_url = caption_url + '&' + params
433             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
434             return [(None, sub_lang, sub)]
435         except KeyError:
436             return [(err_msg, None, None)]
437
438     def _extract_subtitle(self, video_id):
439         """
440         Return a list with a tuple:
441         [(error_message, sub_lang, sub)]
442         """
443         sub_lang_list = self._get_available_subtitles(video_id)
444         sub_format = self._downloader.params.get('subtitlesformat')
445         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
446             return [(sub_lang_list[0], None, None)]
447         if self._downloader.params.get('subtitleslang', False):
448             sub_lang = self._downloader.params.get('subtitleslang')
449         elif 'en' in sub_lang_list:
450             sub_lang = 'en'
451         else:
452             sub_lang = list(sub_lang_list.keys())[0]
453         if not sub_lang in sub_lang_list:
454             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
455
456         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
457         return [subtitle]
458
459     def _extract_all_subtitles(self, video_id):
460         sub_lang_list = self._get_available_subtitles(video_id)
461         sub_format = self._downloader.params.get('subtitlesformat')
462         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
463             return [(sub_lang_list[0], None, None)]
464         subtitles = []
465         for sub_lang in sub_lang_list:
466             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
467             subtitles.append(subtitle)
468         return subtitles
469
470     def _print_formats(self, formats):
471         print('Available formats:')
472         for x in formats:
473             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
474
475     def _real_initialize(self):
476         if self._downloader is None:
477             return
478
479         username = None
480         password = None
481         downloader_params = self._downloader.params
482
483         # Attempt to use provided username and password or .netrc data
484         if downloader_params.get('username', None) is not None:
485             username = downloader_params['username']
486             password = downloader_params['password']
487         elif downloader_params.get('usenetrc', False):
488             try:
489                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
490                 if info is not None:
491                     username = info[0]
492                     password = info[2]
493                 else:
494                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
495             except (IOError, netrc.NetrcParseError) as err:
496                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
497                 return
498
499         # Set language
500         request = compat_urllib_request.Request(self._LANG_URL)
501         try:
502             self.report_lang()
503             compat_urllib_request.urlopen(request).read()
504         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
505             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
506             return
507
508         # No authentication to be performed
509         if username is None:
510             return
511
512         request = compat_urllib_request.Request(self._LOGIN_URL)
513         try:
514             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
515         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
516             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
517             return
518
519         galx = None
520         dsh = None
521         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
522         if match:
523           galx = match.group(1)
524
525         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
526         if match:
527           dsh = match.group(1)
528
529         # Log in
530         login_form_strs = {
531                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
532                 u'Email': username,
533                 u'GALX': galx,
534                 u'Passwd': password,
535                 u'PersistentCookie': u'yes',
536                 u'_utf8': u'霱',
537                 u'bgresponse': u'js_disabled',
538                 u'checkConnection': u'',
539                 u'checkedDomains': u'youtube',
540                 u'dnConn': u'',
541                 u'dsh': dsh,
542                 u'pstMsg': u'0',
543                 u'rmShown': u'1',
544                 u'secTok': u'',
545                 u'signIn': u'Sign in',
546                 u'timeStmp': u'',
547                 u'service': u'youtube',
548                 u'uilel': u'3',
549                 u'hl': u'en_US',
550         }
551         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
552         # chokes on unicode
553         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
554         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
555         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
556         try:
557             self.report_login()
558             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
559             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
560                 self._downloader.report_warning(u'unable to log in: bad username or password')
561                 return
562         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
563             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
564             return
565
566         # Confirm age
567         age_form = {
568                 'next_url':     '/',
569                 'action_confirm':   'Confirm',
570                 }
571         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
572         try:
573             self.report_age_confirmation()
574             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
575         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
576             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
577
578     def _extract_id(self, url):
579         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
580         if mobj is None:
581             raise ExtractorError(u'Invalid URL: %s' % url)
582         video_id = mobj.group(2)
583         return video_id
584
585     def _real_extract(self, url):
586         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
587         mobj = re.search(self._NEXT_URL_RE, url)
588         if mobj:
589             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
590         video_id = self._extract_id(url)
591
592         # Get video webpage
593         self.report_video_webpage_download(video_id)
594         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
595         request = compat_urllib_request.Request(url)
596         try:
597             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
598         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
599             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
600
601         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
602
603         # Attempt to extract SWF player URL
604         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
605         if mobj is not None:
606             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
607         else:
608             player_url = None
609
610         # Get video info
611         self.report_video_info_webpage_download(video_id)
612         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
613             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
614                     % (video_id, el_type))
615             video_info_webpage = self._download_webpage(video_info_url, video_id,
616                                     note=False,
617                                     errnote='unable to download video info webpage')
618             video_info = compat_parse_qs(video_info_webpage)
619             if 'token' in video_info:
620                 break
621         if 'token' not in video_info:
622             if 'reason' in video_info:
623                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
624             else:
625                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
626
627         # Check for "rental" videos
628         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
629             raise ExtractorError(u'"rental" videos not supported')
630
631         # Start extracting information
632         self.report_information_extraction(video_id)
633
634         # uploader
635         if 'author' not in video_info:
636             raise ExtractorError(u'Unable to extract uploader name')
637         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
638
639         # uploader_id
640         video_uploader_id = None
641         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
642         if mobj is not None:
643             video_uploader_id = mobj.group(1)
644         else:
645             self._downloader.report_warning(u'unable to extract uploader nickname')
646
647         # title
648         if 'title' not in video_info:
649             raise ExtractorError(u'Unable to extract video title')
650         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
651
652         # thumbnail image
653         if 'thumbnail_url' not in video_info:
654             self._downloader.report_warning(u'unable to extract video thumbnail')
655             video_thumbnail = ''
656         else:   # don't panic if we can't find it
657             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
658
659         # upload date
660         upload_date = None
661         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
662         if mobj is not None:
663             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
664             upload_date = unified_strdate(upload_date)
665
666         # description
667         video_description = get_element_by_id("eow-description", video_webpage)
668         if video_description:
669             video_description = clean_html(video_description)
670         else:
671             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
672             if fd_mobj:
673                 video_description = unescapeHTML(fd_mobj.group(1))
674             else:
675                 video_description = u''
676
677         # subtitles
678         video_subtitles = None
679
680         if self._downloader.params.get('writesubtitles', False):
681             video_subtitles = self._extract_subtitle(video_id)
682             if video_subtitles:
683                 (sub_error, sub_lang, sub) = video_subtitles[0]
684                 if sub_error:
685                     # We try with the automatic captions
686                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
687                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
688                     if sub is not None:
689                         pass
690                     else:
691                         # We report the original error
692                         self._downloader.report_error(sub_error)
693
694         if self._downloader.params.get('allsubtitles', False):
695             video_subtitles = self._extract_all_subtitles(video_id)
696             for video_subtitle in video_subtitles:
697                 (sub_error, sub_lang, sub) = video_subtitle
698                 if sub_error:
699                     self._downloader.report_error(sub_error)
700
701         if self._downloader.params.get('listsubtitles', False):
702             sub_lang_list = self._list_available_subtitles(video_id)
703             return
704
705         if 'length_seconds' not in video_info:
706             self._downloader.report_warning(u'unable to extract video duration')
707             video_duration = ''
708         else:
709             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
710
711         # token
712         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
713
714         # Decide which formats to download
715         req_format = self._downloader.params.get('format', None)
716
717         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
718             self.report_rtmp_download()
719             video_url_list = [(None, video_info['conn'][0])]
720         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
721             url_map = {}
722             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
723                 url_data = compat_parse_qs(url_data_str)
724                 if 'itag' in url_data and 'url' in url_data:
725                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
726                     if not 'ratebypass' in url: url += '&ratebypass=yes'
727                     url_map[url_data['itag'][0]] = url
728
729             format_limit = self._downloader.params.get('format_limit', None)
730             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
731             if format_limit is not None and format_limit in available_formats:
732                 format_list = available_formats[available_formats.index(format_limit):]
733             else:
734                 format_list = available_formats
735             existing_formats = [x for x in format_list if x in url_map]
736             if len(existing_formats) == 0:
737                 raise ExtractorError(u'no known formats available for video')
738             if self._downloader.params.get('listformats', None):
739                 self._print_formats(existing_formats)
740                 return
741             if req_format is None or req_format == 'best':
742                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
743             elif req_format == 'worst':
744                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
745             elif req_format in ('-1', 'all'):
746                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
747             else:
748                 # Specific formats. We pick the first in a slash-delimited sequence.
749                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
750                 req_formats = req_format.split('/')
751                 video_url_list = None
752                 for rf in req_formats:
753                     if rf in url_map:
754                         video_url_list = [(rf, url_map[rf])]
755                         break
756                 if video_url_list is None:
757                     raise ExtractorError(u'requested format not available')
758         else:
759             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
760
761         results = []
762         for format_param, video_real_url in video_url_list:
763             # Extension
764             video_extension = self._video_extensions.get(format_param, 'flv')
765
766             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
767                                               self._video_dimensions.get(format_param, '???'))
768
769             results.append({
770                 'id':       video_id,
771                 'url':      video_real_url,
772                 'uploader': video_uploader,
773                 'uploader_id': video_uploader_id,
774                 'upload_date':  upload_date,
775                 'title':    video_title,
776                 'ext':      video_extension,
777                 'format':   video_format,
778                 'thumbnail':    video_thumbnail,
779                 'description':  video_description,
780                 'player_url':   player_url,
781                 'subtitles':    video_subtitles,
782                 'duration':     video_duration
783             })
784         return results
785
786
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and confirm age so that family-filtered
        videos are served on subsequent requests."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age by POSTing the family-filter form with filtering off.
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract video URL, title and uploader for a metacafe video.

        YouTube-hosted videos (ids of the form ``yt-...``) are delegated to
        the YouTube extractor via url_result().
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob embedded in the page.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # The webpage (and everything matched from it) is already a unicode
        # string here, so the old .decode('utf-8') calls were a bug: they
        # raised UnicodeError for non-ASCII text on Python 2 and
        # AttributeError on Python 3. They have been removed.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
882
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract video URL, title, uploader and date from a Dailymotion page."""
        # Validate the URL and pull the video id out of it.
        id_mobj = re.match(self._VALID_URL, url)
        if id_mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = id_mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so age-restricted
        # videos are reachable.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        flashvars_mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(flashvars_mobj.group(1))

        # Pick the best available quality key, highest first.
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self.to_screen(u'Using %s' % quality_key)
                break
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')

        url_mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if url_mobj is None:
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(url_mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(title_mobj.group('title'))

        # Uploader: try the regular owner markup first, then the official
        # author markup; warn (do not fail) when neither matches.
        video_uploader = None
        owner_mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_mobj is not None:
            video_uploader = owner_mobj.group(1)
        else:
            official_mobj = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_mobj is not None:
                video_uploader = official_mobj.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY on the page; emit YYYYMMDD.
        video_upload_date = None
        date_mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_mobj is not None:
            video_upload_date = date_mobj.group(3) + date_mobj.group(2) + date_mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
957
958
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract video info from a Photobucket URL.

        Tries the embedded JSON media description first and falls back to
        scraping the video_src link and the page title.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking at the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # webpage (and url) are already unicode strings, so the old
        # .decode('utf-8') calls were a bug: UnicodeError for non-ASCII text
        # on Python 2, AttributeError on Python 3. They have been removed.
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1015
1016
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract video information from a screen.yahoo.com page.

        Newer pages expose a long content id via YUI Media.CONTENT_ID and are
        resolved through the YQL JSON API; older pages fall back to the
        cosmos.bcst.yahoo.com MRSS endpoints.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # BUGFIX: check for a failed match *before* accessing groups.
            # The original code called m_rest.group() first, so a missing
            # match raised AttributeError instead of the intended
            # ExtractorError below.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
1084
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com page.

        Parses the player config JSON embedded in the page and picks the
        best available quality/codec combination.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # BUGFIX: was a bare "except:", which also swallowed SystemExit
            # and KeyboardInterrupt; catch Exception so fatal signals still
            # propagate.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1186
1187
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download and return the raw page body for url.

        Raises ExtractorError on network failure or an invalid URL.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex (with regexFlags) and collect groups.

        matchTuples is a list of (group_index, key, error_message); each
        matched group is stored under key in the returned dict.  Raises
        ExtractorError when the regex or any required group fails to match.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the videothek JS to locate the rtmp live-stream info."""
        # Language code is the 4th path component from the end, e.g. .../fr/...
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is computed but never returned, so live
        # streams produce no result — looks like dead/unfinished code; the
        # caller (_real_extract) also returns nothing for live URLs.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page to its HD video URL and metadata dict."""
        # Language code is the 3rd path component from the end here.
        video_lang = url.split('/')[-3]
        # Step 1: the page embeds a videorefFileUrl pointing at an XML index.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        # Step 2: the index lists one <video> ref per language.
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the per-language document carries id, title, date and the
        # HD stream URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch between live streams and Arte+7 catch-up pages."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live URLs end in index-<n>.html; note that the live branch returns
        # None (extractLiveStream yields no result — see NOTE above).
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1307
1308
1309 class GenericIE(InfoExtractor):
1310     """Generic last-resort information extractor."""
1311
1312     _VALID_URL = r'.*'
1313     IE_NAME = u'generic'
1314
1315     def report_download_webpage(self, video_id):
1316         """Report webpage download."""
1317         if not self._downloader.params.get('test', False):
1318             self._downloader.report_warning(u'Falling back on generic information extractor.')
1319         super(GenericIE, self).report_download_webpage(video_id)
1320
1321     def report_following_redirect(self, new_url):
1322         """Report information extraction."""
1323         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1324
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request (falling back to GET on 405) and follows
        redirects manually.  Returns the final URL if it differs from the
        input, or False when there was no redirect.
        """
        class HeadRequest(compat_urllib_request.Request):
            # A Request whose HTTP method is HEAD instead of GET.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Escape spaces; some servers emit unencoded Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: the new HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-issue as a plain (GET) request through the same opener.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # Same URL back means there was no redirect to follow.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url
1380
1381     def _real_extract(self, url):
1382         new_url = self._test_redirect(url)
1383         if new_url: return [self.url_result(new_url)]
1384
1385         video_id = url.split('/')[-1]
1386         try:
1387             webpage = self._download_webpage(url, video_id)
1388         except ValueError as err:
1389             # since this is the last-resort InfoExtractor, if
1390             # this error is thrown, it'll be thrown here
1391             raise ExtractorError(u'Invalid URL: %s' % url)
1392
1393         self.report_extraction(video_id)
1394         # Start with something easy: JW Player in SWFObject
1395         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1396         if mobj is None:
1397             # Broaden the search a little bit
1398             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1399         if mobj is None:
1400             # Broaden the search a little bit: JWPlayer JS loader
1401             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1402         if mobj is None:
1403             raise ExtractorError(u'Invalid URL: %s' % url)
1404
1405         # It's possible that one of the regexes
1406         # matched, but returned an empty group:
1407         if mobj.group(1) is None:
1408             raise ExtractorError(u'Invalid URL: %s' % url)
1409
1410         video_url = compat_urllib_parse.unquote(mobj.group(1))
1411         video_id = os.path.basename(video_url)
1412
1413         # here's a fun little line of code for you:
1414         video_extension = os.path.splitext(video_id)[1][1:]
1415         video_id = os.path.splitext(video_id)[0]
1416
1417         # it's tempting to parse this further, but you would
1418         # have to take into account all the variations like
1419         #   Video Title - Site Name
1420         #   Site Name | Video Title
1421         #   Video Title - Tagline | Site Name
1422         # and so on and so forth; it's just not practical
1423         mobj = re.search(r'<title>(.*)</title>', webpage)
1424         if mobj is None:
1425             raise ExtractorError(u'Unable to extract title')
1426         video_title = mobj.group(1)
1427
1428         # video uploader is domain name
1429         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1430         if mobj is None:
1431             raise ExtractorError(u'Unable to extract title')
1432         video_uploader = mobj.group(1)
1433
1434         return [{
1435             'id':       video_id,
1436             'url':      video_url,
1437             'uploader': video_uploader,
1438             'upload_date':  None,
1439             'title':    video_title,
1440             'ext':      video_extension,
1441         }]
1442
1443
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # Only byte strings need decoding; on Python 3 the query is already
        # text and str has no .decode() (the old unconditional call crashed).
        if isinstance(query, bytes):
            query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the GData API, 50 results per request, until n
        results were collected or the API reports no further items.
        """
        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids += [video['id'] for video in api_response['items']]

            # The API may hold fewer total items than requested; shrink the goal.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
1486
1487
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        entries = []
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': entries
        }

        pagenum = 0
        while True:
            pagenum += 1
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            # Each hit is an <h3 class="r"> heading wrapping the result link.
            for found in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                entries.append({
                    '_type': 'url',
                    'url': found.group(1)
                })

            # Stop once enough results were seen or there is no "next" link.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
1518
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the JSON search endpoint (30 results per page) and
        stops once n entries were collected or the API reports the last page.
        """
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            # Track the "enough results" condition explicitly: the old code
            # re-tested the loop variable `i` after the loop, which raised
            # NameError whenever a page came back with no results.
            have_enough = False
            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    have_enough = True
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Skip unparsable entries instead of crashing with
                    # AttributeError on mobj.group(...).
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if have_enough or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
1552
1553
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so match with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Page through the playlist feed and return all entries as a playlist."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The playlist id sits in either alternative of _VALID_URL.
        playlist_id = mobj.group(1) or mobj.group(2)

        # Collect (position, video_url) pairs so entries can be ordered later.
        collected = []

        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            feed_entries = response['feed']['entry']
            collected.extend((entry['yt$position']['$t'], entry['content']['src'])
                             for entry in feed_entries
                             if 'content' in entry)

            # A short page is the final page.
            if len(feed_entries) < self._MAX_RESULTS:
                break

        ordered_urls = [pair[1] for pair in sorted(collected)]

        url_results = [self.url_result(u, 'Youtube') for u in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1619
1620
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids linked from *page*, in order of appearance."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            vid = match.group(1)
            if vid not in found:
                found.append(vid)
        return found

    def _real_extract(self, url):
        """Collect all video ids of a channel and return them as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)
        pagenum = 1

        # The first page is plain HTML.
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = self.extract_videos_from_page(page)

        # Subsequent pages come from the JSON-based channel_ajax endpoint,
        # requested for as long as the "load more" widget is present.
        more_pages = self._MORE_PAGES_INDICATOR in page
        while more_pages:
            pagenum = pagenum + 1
            url = self._MORE_PAGES_URL % (pagenum, channel_id)
            page = self._download_webpage(url, channel_id,
                                          u'Downloading page #%s' % pagenum)
            payload = json.loads(page)
            video_ids.extend(self.extract_videos_from_page(payload['content_html']))
            more_pages = self._MORE_PAGES_INDICATOR in payload['load_more_widget_html']

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube')
                       for vid in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1678
1679
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Page through the user's GData uploads feed and return a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData API caps each response (currently at 50 entries), so
        # consecutive pages are fetched until one comes back short.
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect the ids found on this page, keeping first-seen order.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)

            video_ids.extend(page_ids)

            # A page holding fewer than _GDATA_PAGE_SIZE ids must be the
            # last one, so no further request is needed.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube')
                       for vid in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1736
1737
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Resolve a blip.tv user page to a playlist of all their episodes."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # Previously this crashed with an AttributeError on mobj.group
            # when the marker was absent; raise a proper extraction error.
            raise ExtractorError(u'Unable to extract blip.tv user ID')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Deduplicate on the unescaped id (the stored form); the old
                # code tested the raw match but stored the unescaped value.
                vid = unescapeHTML(mobj.group(1))
                if vid not in ids_in_page:
                    ids_in_page.append(vid)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1796
1797
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Request the file page with the 'Free download' form submitted and
        extract the direct download URL and title."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Raw string literal: '\s' in a plain string is an invalid
                # escape (DeprecationWarning on modern Python).
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # On Python 3 these values may already be text; str has no .decode(),
        # so only decode genuine byte strings.
        def _to_text(s):
            return s.decode('utf-8') if isinstance(s, bytes) else s

        return [{
            'id':       _to_text(file_id),
            'url':      _to_text(file_url),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      _to_text(file_extension),
        }]
1842
1843
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook before extraction if credentials are available.

        Credentials come from --username/--password first, then from the
        'facebook' entry in ~/.netrc.  All failures are reported as warnings
        only, and extraction proceeds unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials found: skip the login step entirely.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> still present in the response means we stayed on
            # the login page, i.e. authentication did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video URL, title, duration and thumbnail from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Normalize to the canonical video page for this id.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON nested inside the outer JSON blob.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')
        video_title = unescapeHTML(video_title)

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1939
1940
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info from a blip.tv URL.

        Handles three URL shapes: api.swf links (rewritten to /play/),
        /play/ embeds (resolved via their redirect, then re-extracted),
        and regular pages (queried with skin=json for metadata).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose fragment carries the real
            # file id; follow the redirect and restart extraction with it.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Append the JSON-skin query with the right separator for this URL.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # Reuse the handle opened above; its body holds the JSON.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
2038
2039
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """RC4 *data* (bytes) with *key* (bytes) and return a native str.

        RC4 is symmetric, so the same routine both encrypts and decrypts.
        """
        x = 0
        box = list(range(256))
        # key-scheduling algorithm (KSA)
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        # pseudo-random generation, XORed against the input
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        """Return the hex MD5 digest of *s* (bytes) as bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        """Extract the video info dict for a myvideo.de watch page.

        Tries the directly-embedded flv source first; otherwise downloads
        the RC4-encrypted flashvars XML and decrypts it to find the URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded site key used to derive the RC4 key below.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy case: the flv URL is embedded directly in the page.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            # Result unused, but the call raises if no extension is present.
            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # try encxml
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(b64decode(b64decode(GK)) + md5(video_id))
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # BUG FIX: the original code referenced an undefined name
            # `video_filepath` here, so every f4m (HLS) video crashed with a
            # NameError.  Recover the path prefix from the same decrypted
            # flashvars blob that carries the source= attribute.
            video_filepath = self._search_regex(
                'path=\'(.*?)\'', dec_data, u'video path', fatal=False) or ''
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
2188
2189
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates; the highest (last of a config's renditions) is used
    # as the default format in _real_extract below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> frame dimensions (for --list-formats display only)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with verbose whitespace, so the default
        # suitable() (which matches without re.VERBOSE) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print a bitrate / extension / dimensions table for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Return a list of info dicts, one per part of the episode/clip."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # :tds / :colbert style shortcuts are rewritten to the show's
        # full-episodes page, which redirects to the newest episode.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare show URL means "download the newest episode".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # Follow the server-side redirect to the concrete episode URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a tag without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        # Each <item> of the MRSS index is one part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            # Collect (bitrate, rtmp-url) pairs for every available rendition.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp(e) URL into a plain HTTP download URL on the
            # Limelight CDN; only gsp.comedystor paths are transformable.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2356
2357
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Locate the flash player config for an episode page and pick the
        video URL out of its playlist."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        show_name = match.group('showname')
        video_id = match.group('episode')

        self.report_extraction(show_name)
        webpage = self._download_webpage(url, show_name)

        # Description and thumbnail are optional; unescape only when found.
        description = self._search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)
        if description:
            description = unescapeHTML(description)

        thumbnail = self._search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = unescapeHTML(thumbnail)

        # The og:video meta tag points at the player; its config= query
        # argument carries the (url-encoded) configuration URL.
        player_url = unescapeHTML(self._search_regex(
            '<meta property="og:video" content="([^"]*)"', webpage, u'player url'))
        config_url = compat_urllib_parse.unquote(
            self._search_regex('config=(.*)$', player_url, u'config url'))

        config_json = self._download_webpage(config_url, show_name,
                                             u'Downloading configuration',
                                             u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        config_json = config_json.replace("'", '"')

        try:
            config = json.loads(config_json)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumbnail,
            'description': description,
            'player_url': player_url,
        }]
2417
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Fetch the moogaloop metadata XML, then the f4m manifest it points
        to, and assemble the first fragment URL for the video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Filled in incrementally from the two XML documents below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # hdcore query parameter is appended before fetching the manifest.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        # Parse the Adobe f4m manifest (note: video_id is rebound here to
        # the manifest's own <id> element).
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # NOTE(review): presumably the manifest id always ends in two
        # characters that must be stripped ([:-2]) before building the
        # fragment URL — confirm against a live manifest.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2479
2480
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the flv stream URL, title and thumbnail for a video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The flash variables embed a URL-encoded flv location.
        flv_url = self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(flv_url)

        # Title is the <title> tag minus the trailing site branding.
        video_title = self._search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # Thumbnail lives on the image CDN; a miss here is non-fatal.
        video_thumbnail = self._search_regex(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2521
2522
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a SoundCloud track URL through the public API and return
        the info dict for its 128 kbit/s MP3 stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the song title (also in the url)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # resolve.json maps the human-readable page URL onto the track's
        # numeric API id.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # The streams endpoint lists the actual media URLs per encoding.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2579
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a SoundCloud set URL through the public API and return
        one info dict per track (128 kbit/s MP3 streams)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also in the url)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # resolve.json maps the human-readable page URL onto the set's
        # API representation (including all tracks).
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            # Per-track streams endpoint with the actual media URLs.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2642
2643
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Build the RTMP stream URL for an InfoQ presentation page."""
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The page stores the stream id base64- and url-encoded in the
        # `jsclassref` JavaScript variable.
        match = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video url')
        encoded_id = match.group(1).encode('ascii')
        real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id and extension from the final path component of the URL.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2686
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): file_url is assigned but never used in this method.
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate with a HEAD-less GET; first one that opens
        # without raising wins.
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print format/bitrate/extension table for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the Mixcloud cloudcast API and pick a working audio URL.

        NOTE(review): the many .decode('utf-8') calls below operate on str
        objects and are Python 2 only; under Python 3 they raise
        AttributeError — consistent with _WORKING = False above.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # No explicit format requested: take the first format whose
            # candidate URL actually responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2791
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches three site levels: the root, a course page, or a specific
    # video page; the optional 'course' and 'video' groups decide which
    # branch of _real_extract runs.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        # Dispatch on URL depth: a video page yields a single info dict;
        # a course page and the root yield lists of 'reference' entries
        # that are resolved recursively through self.extract().
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # <title> and <videoFile> (a path relative to baseUrl) are
                # both required; IndexError means the XML is malformed.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Fall back to the course id when no <h1> title is present.
            info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
            info['title'] = unescapeHTML(info['title'])

            info['description'] = self._search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)
            if info['description']: info['description'] = unescapeHTML(info['description'])

            # Each VideoPage link on the course page becomes a reference
            # entry; orderedSet preserves page order while deduplicating.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Every CoursePage link on the home page becomes a reference
            # entry, each of which re-enters the course branch above.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2889
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract a single MTV.com video.

        Scrapes performer/song metadata from the video page, then fetches
        the mediaGen XML and picks the highest-quality rendition.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The mtv_vt meta tag holds the song name, mtv_an the performer.
        song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)
        if song_name: song_name = unescapeHTML(song_name)

        performer = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'performer')
        performer = unescapeHTML(performer)

        # BUG FIX: the previous code bound the mtv_an value to video_title
        # and then referenced an undefined name `performer` below, raising
        # NameError on every extraction, while the extracted song_name was
        # never used.  Restore the intended "<performer> - <song>" title.
        if song_name:
            video_title = performer + u' - ' + song_name
        else:
            video_title = performer

        mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # Both values are required to build the mediaGen URL below; fail
        # with a clear message instead of a TypeError on concatenating None.
        if mtvn_uri is None or content_id is None:
            raise ExtractorError(u'Unable to extract mtvn_uri or content id')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality (renditions are listed
        # in ascending quality order in the mediaGen document).
        rendition = renditions[-1]

        try:
            # type is e.g. "video/mp4"; the subtype doubles as the extension.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2952
2953
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Session id: millisecond timestamp followed by two random numbers.
        millis = int(time.time() * 1000)
        rand_a = random.randint(1000, 1998)
        rand_b = random.randint(1000, 9999)
        return "%d%d%d" % (millis, rand_a, rand_b)

    def _get_file_ID_mix_string(self, seed):
        # Permute the alphabet below with a linear congruential generator
        # seeded by the server-provided value; the result is the lookup
        # table used to decode the obfuscated file id.
        alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        state = float(seed)
        while alphabet:
            state = (state * 211 + 30031) % 65536
            pick = int(math.floor(state / 65536 * len(alphabet)))
            shuffled.append(alphabet.pop(pick))
        return shuffled

    def _get_file_id(self, fileId, seed):
        # The obfuscated id is '*'-separated indices into the shuffled
        # alphabet; decode each non-empty token.
        table = self._get_file_ID_mix_string(seed)
        return ''.join(table[int(tok)] for tok in fileId.split('*') if tok)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            # Map the requested format onto Youku's stream names; anything
            # unrecognized falls back to flv.
            if format is None or format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the decoded file id encode the segment number,
        # so they are replaced per segment below.
        files_info = []
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3046
3047
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # Fetch the page and scrape the flash player parameters from it.
        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the player parameters.
        flv_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        title = self._search_regex(self.VIDEO_TITLE_RE, webpage, u'title')

        thumb = self._search_regex(self.VIDEO_THUMB_RE, webpage,
            u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': flv_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumb,
            'description': None,
        }]
3086
3087
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1: fetch the post page, which carries all the metadata.
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Upload date, normalized to YYYYMMDD for use in filenames.
        upload_date = self._search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # The first line of the post description serves as the title.
        video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2: follow the photo-viewer page that hosts the actual video.
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Collect (resolution, url) pairs for every available size.
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        candidates = re.findall(pattern, webpage)
        if not candidates:
            raise ExtractorError(u'Unable to extract video links')

        # After sorting, the last tuple has the highest resolution; we only
        # need its URL component.
        video_url = sorted(candidates)[-1][-1]

        # Undo \u0026-style escaping inside the URL.
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3161
class NBAIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Extract an NBA.com video by mapping its page path onto the
        known CDN download location."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The 720p MP4 lives at a predictable CDN address derived from
        # the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # The upload date is not present in the HTML the site returns.

        description = self._search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
3195
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a bare channel name (paged archive listing),
    # .../b/<id> (a single archived broadcast), .../c/<id> (a chapter,
    # i.e. a highlighted slice of a broadcast).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Number of videos requested per API page when paging a channel archive.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        # Fetch one page of the JSON API and turn every clip that has a
        # downloadable file into an info dict.  The raw response length is
        # returned as well so the caller can detect the last page.
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # API errors come back as a JSON object with an 'error' field.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with YYYY-MM-DD; strip the dashes to
                # get the YYYYMMDD upload_date format.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        # Dispatch on which named group matched: channel archives are
        # paged; a chapter resolves to its parent broadcast's file; a
        # single broadcast is fetched in one API call.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            # The chapter page embeds the id of the archive it belongs to.
            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Find the <archive> element matching the page's archive id;
            # the loop's else clause fires when no element matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Title/description/channel metadata comes from the newer
            # Twitch kraken API rather than the XML above.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        # Page through the API; a page shorter than the limit (or a
        # non-paged request) means we are done.
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3328
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the media URL, title and description from a Funny or
        Die video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = unescapeHTML(self._search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL))

        # Prefer the player header, falling back to the page <title>.
        title = clean_html(self._search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            webpage, 'title', flags=re.DOTALL))

        video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)
        if video_description:
            video_description = unescapeHTML(video_description)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
3360
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose style, so the base class's
        # plain-compiled match cannot be used here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Hit the age-check URL with a fixed birthday so that the gated
        # video listing is served without interaction.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        # Movie entries, display names and thumbnails appear in the same
        # order on the page, so walk the three match streams in lockstep.
        videos = []
        for movie, name, thumb in zip(re.finditer(urlRE, webpage),
                                      re.finditer(namesRE, webpage),
                                      re.finditer(thumbsRE, webpage)):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3405
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Extract a recorded ustream.tv video via the tcdn mirror."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Recorded videos are mirrored at a predictable CDN address.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
        if uploader:
            uploader = unescapeHTML(uploader.strip())

        thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # NOTE: unlike most extractors this historically returns a bare
        # dict rather than a one-element list; preserved for compatibility.
        return {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': video_title,
                'uploader': uploader,
                'thumbnail': thumbnail,
               }
3438
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The player sets the media URL via so.addVariable("file", ...).
        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
                'id': video_id,
                'url': video_url,
                'title': video_title,
                'thumbnail': thumbnail,
                'ext': ext,
                }]
3478
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded in the page as a JSON blob.
        json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
            webpage, u'json data')

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream from the Akamai edge.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
                'id': video_id,
                'url': video_url,
                'ext': video_ext,
                'title': data['title'],
                'description': data.get('teaser_text'),
                'location': data.get('country_of_origin'),
                'uploader': host.get('name'),
                'uploader_id': host.get('slug'),
                'thumbnail': data.get('image', {}).get('large_url_2x'),
                'duration': data.get('duration'),
        }]
3512
3513
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The age-verification cookie skips the interstitial page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        # was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as e:
            # str-interpolate the exception: concatenating the exception object
            # to a str (as before) raised TypeError instead of reporting the key
            raise ExtractorError(u'Missing JSON parameter: %s' % e)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            size, bitrate = path.split('/')[4].split('_')[:2]
            format = size + '-' + bitrate
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: previously tested the undefined name `result`,
            # which raised NameError whenever a specific format was requested
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3618
3619
3620
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the player setup code.
        video_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url'))

        # The upload date is optional; normalize it to YYYYMMDD when present.
        upload_date = self._search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
3655
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        video_title = self._search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # The actual stream lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3696
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the 8tracks play API, collecting one entry per track in the mix."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API needs a (random) session token plus the mix id.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            # The API flags the final track; otherwise ask for the next one.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
3737
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs follow a fixed CDN pattern keyed on the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = unescapeHTML(self._search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title'))

        # The uploader block is optional.
        uploader = self._search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)
        if uploader:
            uploader = clean_html(uploader)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
3767
class TEDIE(InfoExtractor):
    # Matches both playlists and single talks; must be applied with re.VERBOSE.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to talk or playlist extraction based on the URL shape."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Talk entries in the playlist page; evaluated with re.VERBOSE below.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        # Separate pattern for each talk's title/URL anchor.
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Pair each talk entry with its name anchor; each talk becomes a
        # url_result that is re-extracted individually by this IE.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')  # NOTE(review): captured but unused
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails JS object carries the numeric id and the media slug
        # used to build the download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3846
class MySpassIE(InfoExtractor):
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Extract a myspass.de video via its XML metadata endpoint."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: previously assigned the undefined name `ext` (NameError);
            # fall back to the file extension when no format id is present.
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3900
class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = unescapeHTML(self._search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title'))

        # A companion XML document describes the available renditions.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last entry in the document (as the original code did —
        # presumably the best rendition).
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3933
class LiveLeakIE(InfoExtractor):

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # og:title carries a "LiveLeak.com -" prefix that we strip off.
        video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        video_title = unescapeHTML(video_title).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)
        if video_description:
            video_description = unescapeHTML(video_description)

        video_uploader = self._search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }]
3972
class ARDIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer the numeric documentId query parameter as the video id,
        # falling back to the last URL path component.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type (0) and highest quality for now
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4011
class TumblrIE(InfoExtractor):
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        blog = mobj.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video source is embedded as escaped markup (\x22 == '"').
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster as the thumbnail.
        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = unescapeHTML(self._search_regex(
            r'<title>(?P<title>.*?)</title>', webpage, u'title', flags=re.DOTALL))

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext
        }]
4046
class BandcampIE(InfoExtractor):
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract a freely-downloadable Bandcamp track (mp3-320 only)."""
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Track id from the TralbumData JS object.
        # NOTE(review): `id` shadows the builtin; kept as-is to avoid drift.
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id':id,
                      'title' : info[u'title'],
                      'ext' :   'mp3',
                      'url' :   final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' :  info[u'artist']
                      }

        return [track_info]
4092
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The mp4 source URL sits directly in a <source> tag.
        video_url = self._search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
4120         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Metadata (including the mp4 URL) comes from the player's MRSS feed.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
4147
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical page for the id.
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        # Description and thumbnail are optional.
        video_description = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
4181
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical https page for the id.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # Thumbnail and uploader are optional.
        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
4215
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Resolve a Flickr photo-page URL to its video stream.

        Flickr requires two intermediate XML fetches: the first yields a
        playlist node id, the second the RTMP-style APP/FULLPATH pair that
        is concatenated into the final video URL.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Per-photo secret embedded in the page; needed by the video API.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
        node_id = self._search_regex(r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        # OpenGraph metadata on the original page (content may be quoted
        # with either double or single quotes).
        video_title = self._search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
4264
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Locate the numeric video id on the page, then read the stream
        URL from the per-video companion XML document."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        video_id = self._search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        # Page-level metadata comes from the OpenGraph tags.
        video_title = self._search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # The actual stream location lives in a separate XML document.
        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
4303
4304 class XHamsterIE(InfoExtractor):
4305     """Information Extractor for xHamster"""
4306     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4307
4308     def _real_extract(self,url):
4309         mobj = re.match(self._VALID_URL, url)
4310
4311         video_id = mobj.group('id')
4312         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
4313         webpage = self._download_webpage(mrss_url, video_id)
4314
4315         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4316         if mobj is None:
4317             raise ExtractorError(u'Unable to extract media URL')
4318         if len(mobj.group('server')) == 0:
4319             video_url = compat_urllib_parse.unquote(mobj.group('file'))
4320         else:
4321             video_url = mobj.group('server')+'/key='+mobj.group('file')
4322         video_extension = video_url.split('.')[-1]
4323
4324         video_title = self._search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
4325             webpage, u'title')
4326         video_title = unescapeHTML(video_title)
4327
4328         # Can't see the description anywhere in the UI
4329         # video_description = self._search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
4330         #     webpage, u'description', fatal=False)
4331         # if video_description: video_description = unescapeHTML(video_description)
4332
4333         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4334         if mobj:
4335             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4336         else:
4337             video_upload_date = None
4338             self._downloader.report_warning(u'Unable to extract upload date')
4339
4340         video_uploader_id = self._search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)',
4341             webpage, u'uploader id', default=u'anonymous')
4342
4343         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
4344             webpage, u'thumbnail', fatal=False)
4345
4346         return [{
4347             'id':       video_id,
4348             'url':      video_url,
4349             'ext':      video_extension,
4350             'title':    video_title,
4351             # 'description': video_description,
4352             'upload_date': video_upload_date,
4353             'uploader_id': video_uploader_id,
4354             'thumbnail': video_thumbnail
4355         }]
4356
class HypemIE(InfoExtractor):
    """Information Extractor for hypem (hypem.com, the Hype Machine)"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Resolve a Hype Machine track page to its served MP3 URL.

        The page embeds a JSON track list; the session cookie from the
        first request must be echoed back on the /serve metadata request.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The site expects a timestamped query string on the page request.
        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        except (KeyError, IndexError):
            # Fixed: valid JSON with no usable track previously escaped as
            # a raw KeyError/IndexError instead of a clean extractor error.
            raise ExtractorError(u'Hypemachine page contained no tracks.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
4406
4407
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep the classes in matching order and instantiate them in one pass.
    ie_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        GenericIE,
    ]
    return [klass() for klass in ie_classes]
4473
def get_info_extractor(ie_name):
    """Return the info extractor class with the given ie_name.

    Looks up the class named '<ie_name>IE' among this module's globals;
    raises KeyError if no such extractor exists.
    """
    return globals()['%sIE' % ie_name]