youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19 import hashlib
  20 import binascii
  21 import urllib
  22
  23 from .utils import *
  24
  25
  26 class InfoExtractor(object):
  27     """Information Extractor class.
  28
  29     Information extractors are the classes that, given a URL, extract
  30     information about the video (or videos) the URL refers to. This
  31     information includes the real video URL, the video title, author and
  32     others. The information is stored in a dictionary which is then
  33     passed to the FileDownloader. The FileDownloader processes this
  34     information possibly downloading the video to the file system, among
  35     other possible outcomes.
  36
  37     The dictionaries must include the following fields:
  38
  39     id:             Video identifier.
  40     url:            Final video URL.
  41     title:          Video title, unescaped.
  42     ext:            Video filename extension.
  43
  44     The following fields are optional:
  45
  46     format:         The video format, defaults to ext (used for --get-format)
  47     thumbnail:      Full URL to a video thumbnail image.
  48     description:    One-line video description.
  49     uploader:       Full name of the video uploader.
  50     upload_date:    Video upload date (YYYYMMDD).
  51     uploader_id:    Nickname or id of the video uploader.
  52     location:       Physical location of the video.
  53     player_url:     SWF Player URL (used for rtmpdump).
  54     subtitles:      The subtitle file contents.
  55     urlhandle:      [internal] The urlHandle to be used to download the file,
  56                     like returned by urllib.request.urlopen
  57
  58     The fields should all be Unicode strings.
  59
  60     Subclasses of this one should re-define the _real_initialize() and
  61     _real_extract() methods and define a _VALID_URL regexp.
  62     Probably, they should also be added to the list of extractors.
  63
  64     _real_extract() must return a *list* of information dictionaries as
  65     described above.
  66
  67     Finally, the _WORKING attribute should be set to False for broken IEs
  68     in order to warn the users and skip the tests.
  69     """
  70
  71     _ready = False
  72     _downloader = None
  73     _WORKING = True
  74
  75     def __init__(self, downloader=None):
  76         """Constructor. Receives an optional downloader."""
  77         self._ready = False
  78         self.set_downloader(downloader)
  79
  80     @classmethod
  81     def suitable(cls, url):
  82         """Receives a URL and returns True if suitable for this IE."""
  83         return re.match(cls._VALID_URL, url) is not None
  84
  85     @classmethod
  86     def working(cls):
  87         """Getter method for _WORKING."""
  88         return cls._WORKING
  89
  90     def initialize(self):
  91         """Initializes an instance (authentication, etc)."""
  92         if not self._ready:
  93             self._real_initialize()
  94             self._ready = True
  95
  96     def extract(self, url):
  97         """Extracts URL information and returns it in list of dicts."""
  98         self.initialize()
  99         return self._real_extract(url)
 100
 101     def set_downloader(self, downloader):
 102         """Sets the downloader for this IE."""
 103         self._downloader = downloader
 104
 105     def _real_initialize(self):
 106         """Real initialization process. Redefine in subclasses."""
 107         pass
 108
 109     def _real_extract(self, url):
 110         """Real extraction process. Redefine in subclasses."""
 111         pass
 112
 113     @property
 114     def IE_NAME(self):
 115         return type(self).__name__[:-2]
 116
 117     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 118         """ Returns the response handle """
 119         if note is None:
 120             self.report_download_webpage(video_id)
 121         elif note is not False:
 122             self.to_screen(u'%s: %s' % (video_id, note))
 123         try:
 124             return compat_urllib_request.urlopen(url_or_request)
 125         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 126             if errnote is None:
 127                 errnote = u'Unable to download webpage'
 128             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 129
 130     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 131         """ Returns a tuple (page content as string, URL handle) """
 132         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 133         content_type = urlh.headers.get('Content-Type', '')
 134         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 135         if m:
 136             encoding = m.group(1)
 137         else:
 138             encoding = 'utf-8'
 139         webpage_bytes = urlh.read()
 140         if self._downloader.params.get('dump_intermediate_pages', False):
 141             try:
 142                 url = url_or_request.get_full_url()
 143             except AttributeError:
 144                 url = url_or_request
 145             self.to_screen(u'Dumping request to ' + url)
 146             dump = base64.b64encode(webpage_bytes).decode('ascii')
 147             self._downloader.to_screen(dump)
 148         content = webpage_bytes.decode(encoding, 'replace')
 149         return (content, urlh)
 150
 151     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 152         """ Returns the data of the page as a string """
 153         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 154
 155     def to_screen(self, msg):
 156         """Print msg to screen, prefixing it with '[ie_name]'"""
 157         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 158
 159     def report_extraction(self, id_or_name):
 160         """Report information extraction."""
 161         self.to_screen(u'%s: Extracting information' % id_or_name)
 162
 163     def report_download_webpage(self, video_id):
 164         """Report webpage download."""
 165         self.to_screen(u'%s: Downloading webpage' % video_id)
 166
 167     def report_age_confirmation(self):
 168         """Report attempt to confirm age."""
 169         self.to_screen(u'Confirming age')
 170
 171     #Methods for following #608
 172     #They set the correct value of the '_type' key
 173     def video_result(self, video_info):
 174         """Returns a video"""
 175         video_info['_type'] = 'video'
 176         return video_info
 177     def url_result(self, url, ie=None):
 178         """Returns a url that points to a page that should be processed"""
 179         #TODO: ie should be the class used for getting the info
 180         video_info = {'_type': 'url',
 181                       'url': url,
 182                       'ie_key': ie}
 183         return video_info
 184     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 185         """Returns a playlist"""
 186         video_info = {'_type': 'playlist',
 187                       'entries': entries}
 188         if playlist_id:
 189             video_info['id'] = playlist_id
 190         if playlist_title:
 191             video_info['title'] = playlist_title
 192         return video_info
 193
 194 class SearchInfoExtractor(InfoExtractor):
 195     """
 196     Base class for paged search queries extractors.
 197     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 198     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 199     """
 200
 201     @classmethod
 202     def _make_valid_url(cls):
 203         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 204
 205     @classmethod
 206     def suitable(cls, url):
 207         return re.match(cls._make_valid_url(), url) is not None
 208
 209     def _real_extract(self, query):
 210         mobj = re.match(self._make_valid_url(), query)
 211         if mobj is None:
 212             raise ExtractorError(u'Invalid search query "%s"' % query)
 213
 214         prefix = mobj.group('prefix')
 215         query = mobj.group('query')
 216         if prefix == '':
 217             return self._get_n_results(query, 1)
 218         elif prefix == 'all':
 219             return self._get_n_results(query, self._MAX_RESULTS)
 220         else:
 221             n = int(prefix)
 222             if n <= 0:
 223                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 224             elif n > self._MAX_RESULTS:
 225                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 226                 n = self._MAX_RESULTS
 227             return self._get_n_results(query, n)
 228
 229     def _get_n_results(self, query, n):
 230         """Get a specified number of results for a query"""
 231         raise NotImplementedError("This method must be implemented by sublclasses")
 232
 233
 234 class YoutubeIE(InfoExtractor):
 235     """Information extractor for youtube.com."""
 236
 237     _VALID_URL = r"""^
 238                      (
 239                          (?:https?://)?                                       # http(s):// (optional)
 240                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 241                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 242                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 243                          (?:                                                  # the various things that can precede the ID:
 244                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 245                              |(?:                                             # or the v= param in all its forms
 246                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 247                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 248                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 249                                  v=
 250                              )
 251                          )?                                                   # optional -> youtube.com/xxxx is OK
 252                      )?                                                       # all until now is optional -> you can pass the naked ID
 253                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 254                      (?(1).+)?                                                # if we found the ID, everything can follow
 255                      $"""
 256     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 257     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 258     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 259     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 260     _NETRC_MACHINE = 'youtube'
 261     # Listed in order of quality
 262     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 263     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 264     _video_extensions = {
 265         '13': '3gp',
 266         '17': 'mp4',
 267         '18': 'mp4',
 268         '22': 'mp4',
 269         '37': 'mp4',
 270         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 271         '43': 'webm',
 272         '44': 'webm',
 273         '45': 'webm',
 274         '46': 'webm',
 275     }
 276     _video_dimensions = {
 277         '5': '240x400',
 278         '6': '???',
 279         '13': '???',
 280         '17': '144x176',
 281         '18': '360x640',
 282         '22': '720x1280',
 283         '34': '360x640',
 284         '35': '480x854',
 285         '37': '1080x1920',
 286         '38': '3072x4096',
 287         '43': '360x640',
 288         '44': '480x854',
 289         '45': '720x1280',
 290         '46': '1080x1920',
 291     }
 292     IE_NAME = u'youtube'
 293
 294     @classmethod
 295     def suitable(cls, url):
 296         """Receives a URL and returns True if suitable for this IE."""
 297         if YoutubePlaylistIE.suitable(url): return False
 298         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 299
 300     def report_lang(self):
 301         """Report attempt to set language."""
 302         self.to_screen(u'Setting language')
 303
 304     def report_login(self):
 305         """Report attempt to log in."""
 306         self.to_screen(u'Logging in')
 307
 308     def report_video_webpage_download(self, video_id):
 309         """Report attempt to download video webpage."""
 310         self.to_screen(u'%s: Downloading video webpage' % video_id)
 311
 312     def report_video_info_webpage_download(self, video_id):
 313         """Report attempt to download video info webpage."""
 314         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 315
 316     def report_video_subtitles_download(self, video_id):
 317         """Report attempt to download video info webpage."""
 318         self.to_screen(u'%s: Checking available subtitles' % video_id)
 319
 320     def report_video_subtitles_request(self, video_id, sub_lang, format):
 321         """Report attempt to download video info webpage."""
 322         self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 323
 324     def report_video_subtitles_available(self, video_id, sub_lang_list):
 325         """Report available subtitles."""
 326         sub_lang = ",".join(list(sub_lang_list.keys()))
 327         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
 328
 329     def report_information_extraction(self, video_id):
 330         """Report attempt to extract video information."""
 331         self.to_screen(u'%s: Extracting video information' % video_id)
 332
 333     def report_unavailable_format(self, video_id, format):
 334         """Report extracted video URL."""
 335         self.to_screen(u'%s: Format %s not available' % (video_id, format))
 336
 337     def report_rtmp_download(self):
 338         """Indicate the download will use the RTMP protocol."""
 339         self.to_screen(u'RTMP download detected')
 340
 341     def _get_available_subtitles(self, video_id):
 342         self.report_video_subtitles_download(video_id)
 343         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 344         try:
 345             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 346         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 347             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 348         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 349         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 350         if not sub_lang_list:
 351             return (u'video doesn\'t have subtitles', None)
 352         return sub_lang_list
 353
 354     def _list_available_subtitles(self, video_id):
 355         sub_lang_list = self._get_available_subtitles(video_id)
 356         self.report_video_subtitles_available(video_id, sub_lang_list)
 357
 358     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 359         """
 360         Return tuple:
 361         (error_message, sub_lang, sub)
 362         """
 363         self.report_video_subtitles_request(video_id, sub_lang, format)
 364         params = compat_urllib_parse.urlencode({
 365             'lang': sub_lang,
 366             'name': sub_name,
 367             'v': video_id,
 368             'fmt': format,
 369         })
 370         url = 'http://www.youtube.com/api/timedtext?' + params
 371         try:
 372             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 373         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 374             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 375         if not sub:
 376             return (u'Did not fetch video subtitles', None, None)
 377         return (None, sub_lang, sub)
 378
 379     def _request_automatic_caption(self, video_id, webpage):
 380         """We need the webpage for getting the captions url, pass it as an
 381            argument to speed up the process."""
 382         sub_lang = self._downloader.params.get('subtitleslang')
 383         sub_format = self._downloader.params.get('subtitlesformat')
 384         self.to_screen(u'%s: Looking for automatic captions' % video_id)
 385         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 386         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
 387         if mobj is None:
 388             return [(err_msg, None, None)]
 389         player_config = json.loads(mobj.group(1))
 390         try:
 391             args = player_config[u'args']
 392             caption_url = args[u'ttsurl']
 393             timestamp = args[u'timestamp']
 394             params = compat_urllib_parse.urlencode({
 395                 'lang': 'en',
 396                 'tlang': sub_lang,
 397                 'fmt': sub_format,
 398                 'ts': timestamp,
 399                 'kind': 'asr',
 400             })
 401             subtitles_url = caption_url + '&' + params
 402             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
 403             return [(None, sub_lang, sub)]
 404         except KeyError:
 405             return [(err_msg, None, None)]
 406
 407     def _extract_subtitle(self, video_id):
 408         """
 409         Return a list with a tuple:
 410         [(error_message, sub_lang, sub)]
 411         """
 412         sub_lang_list = self._get_available_subtitles(video_id)
 413         sub_format = self._downloader.params.get('subtitlesformat')
 414         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 415             return [(sub_lang_list[0], None, None)]
 416         if self._downloader.params.get('subtitleslang', False):
 417             sub_lang = self._downloader.params.get('subtitleslang')
 418         elif 'en' in sub_lang_list:
 419             sub_lang = 'en'
 420         else:
 421             sub_lang = list(sub_lang_list.keys())[0]
 422         if not sub_lang in sub_lang_list:
 423             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 424
 425         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 426         return [subtitle]
 427
 428     def _extract_all_subtitles(self, video_id):
 429         sub_lang_list = self._get_available_subtitles(video_id)
 430         sub_format = self._downloader.params.get('subtitlesformat')
 431         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 432             return [(sub_lang_list[0], None, None)]
 433         subtitles = []
 434         for sub_lang in sub_lang_list:
 435             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 436             subtitles.append(subtitle)
 437         return subtitles
 438
 439     def _print_formats(self, formats):
 440         print('Available formats:')
 441         for x in formats:
 442             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 443
 444     def _real_initialize(self):
 445         if self._downloader is None:
 446             return
 447
 448         username = None
 449         password = None
 450         downloader_params = self._downloader.params
 451
 452         # Attempt to use provided username and password or .netrc data
 453         if downloader_params.get('username', None) is not None:
 454             username = downloader_params['username']
 455             password = downloader_params['password']
 456         elif downloader_params.get('usenetrc', False):
 457             try:
 458                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 459                 if info is not None:
 460                     username = info[0]
 461                     password = info[2]
 462                 else:
 463                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 464             except (IOError, netrc.NetrcParseError) as err:
 465                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 466                 return
 467
 468         # Set language
 469         request = compat_urllib_request.Request(self._LANG_URL)
 470         try:
 471             self.report_lang()
 472             compat_urllib_request.urlopen(request).read()
 473         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 474             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 475             return
 476
 477         # No authentication to be performed
 478         if username is None:
 479             return
 480
 481         request = compat_urllib_request.Request(self._LOGIN_URL)
 482         try:
 483             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 484         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 485             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 486             return
 487
 488         galx = None
 489         dsh = None
 490         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 491         if match:
 492           galx = match.group(1)
 493
 494         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 495         if match:
 496           dsh = match.group(1)
 497
 498         # Log in
 499         login_form_strs = {
 500                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 501                 u'Email': username,
 502                 u'GALX': galx,
 503                 u'Passwd': password,
 504                 u'PersistentCookie': u'yes',
 505                 u'_utf8': u'霱',
 506                 u'bgresponse': u'js_disabled',
 507                 u'checkConnection': u'',
 508                 u'checkedDomains': u'youtube',
 509                 u'dnConn': u'',
 510                 u'dsh': dsh,
 511                 u'pstMsg': u'0',
 512                 u'rmShown': u'1',
 513                 u'secTok': u'',
 514                 u'signIn': u'Sign in',
 515                 u'timeStmp': u'',
 516                 u'service': u'youtube',
 517                 u'uilel': u'3',
 518                 u'hl': u'en_US',
 519         }
 520         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 521         # chokes on unicode
 522         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 523         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 524         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 525         try:
 526             self.report_login()
 527             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 528             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 529                 self._downloader.report_warning(u'unable to log in: bad username or password')
 530                 return
 531         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 532             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 533             return
 534
 535         # Confirm age
 536         age_form = {
 537                 'next_url':     '/',
 538                 'action_confirm':   'Confirm',
 539                 }
 540         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 541         try:
 542             self.report_age_confirmation()
 543             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 544         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 545             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 546
 547     def _extract_id(self, url):
 548         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 549         if mobj is None:
 550             raise ExtractorError(u'Invalid URL: %s' % url)
 551         video_id = mobj.group(2)
 552         return video_id
 553
 554     def _real_extract(self, url):
 555         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 556         mobj = re.search(self._NEXT_URL_RE, url)
 557         if mobj:
 558             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 559         video_id = self._extract_id(url)
 560
 561         # Get video webpage
 562         self.report_video_webpage_download(video_id)
 563         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 564         request = compat_urllib_request.Request(url)
 565         try:
 566             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 567         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 568             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
 569
 570         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 571
 572         # Attempt to extract SWF player URL
 573         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 574         if mobj is not None:
 575             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 576         else:
 577             player_url = None
 578
 579         # Get video info
 580         self.report_video_info_webpage_download(video_id)
 581         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 582             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 583                     % (video_id, el_type))
 584             video_info_webpage = self._download_webpage(video_info_url, video_id,
 585                                     note=False,
 586                                     errnote='unable to download video info webpage')
 587             video_info = compat_parse_qs(video_info_webpage)
 588             if 'token' in video_info:
 589                 break
 590         if 'token' not in video_info:
 591             if 'reason' in video_info:
 592                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
 593             else:
 594                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
 595
 596         # Check for "rental" videos
 597         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 598             raise ExtractorError(u'"rental" videos not supported')
 599
 600         # Start extracting information
 601         self.report_information_extraction(video_id)
 602
 603         # uploader
 604         if 'author' not in video_info:
 605             raise ExtractorError(u'Unable to extract uploader name')
 606         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 607
 608         # uploader_id
 609         video_uploader_id = None
 610         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 611         if mobj is not None:
 612             video_uploader_id = mobj.group(1)
 613         else:
 614             self._downloader.report_warning(u'unable to extract uploader nickname')
 615
 616         # title
 617         if 'title' not in video_info:
 618             raise ExtractorError(u'Unable to extract video title')
 619         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 620
 621         # thumbnail image
 622         if 'thumbnail_url' not in video_info:
 623             self._downloader.report_warning(u'unable to extract video thumbnail')
 624             video_thumbnail = ''
 625         else:   # don't panic if we can't find it
 626             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 627
 628         # upload date
 629         upload_date = None
 630         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 631         if mobj is not None:
 632             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 633             upload_date = unified_strdate(upload_date)
 634
 635         # description
 636         video_description = get_element_by_id("eow-description", video_webpage)
 637         if video_description:
 638             video_description = clean_html(video_description)
 639         else:
 640             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 641             if fd_mobj:
 642                 video_description = unescapeHTML(fd_mobj.group(1))
 643             else:
 644                 video_description = u''
 645
 646         # subtitles
 647         video_subtitles = None
 648
 649         if self._downloader.params.get('writesubtitles', False):
 650             video_subtitles = self._extract_subtitle(video_id)
 651             if video_subtitles:
 652                 (sub_error, sub_lang, sub) = video_subtitles[0]
 653                 if sub_error:
 654                     # We try with the automatic captions
 655                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
 656                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
 657                     if sub is not None:
 658                         pass
 659                     else:
 660                         # We report the original error
 661                         self._downloader.report_error(sub_error)
 662
 663         if self._downloader.params.get('allsubtitles', False):
 664             video_subtitles = self._extract_all_subtitles(video_id)
 665             for video_subtitle in video_subtitles:
 666                 (sub_error, sub_lang, sub) = video_subtitle
 667                 if sub_error:
 668                     self._downloader.report_error(sub_error)
 669
 670         if self._downloader.params.get('listsubtitles', False):
 671             sub_lang_list = self._list_available_subtitles(video_id)
 672             return
 673
 674         if 'length_seconds' not in video_info:
 675             self._downloader.report_warning(u'unable to extract video duration')
 676             video_duration = ''
 677         else:
 678             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 679
 680         # token
 681         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 682
 683         # Decide which formats to download
 684         req_format = self._downloader.params.get('format', None)
 685
 686         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 687             self.report_rtmp_download()
 688             video_url_list = [(None, video_info['conn'][0])]
 689         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 690             url_map = {}
 691             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
 692                 url_data = compat_parse_qs(url_data_str)
 693                 if 'itag' in url_data and 'url' in url_data:
 694                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
 695                     if not 'ratebypass' in url: url += '&ratebypass=yes'
 696                     url_map[url_data['itag'][0]] = url
 697
 698             format_limit = self._downloader.params.get('format_limit', None)
 699             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 700             if format_limit is not None and format_limit in available_formats:
 701                 format_list = available_formats[available_formats.index(format_limit):]
 702             else:
 703                 format_list = available_formats
 704             existing_formats = [x for x in format_list if x in url_map]
 705             if len(existing_formats) == 0:
 706                 raise ExtractorError(u'no known formats available for video')
 707             if self._downloader.params.get('listformats', None):
 708                 self._print_formats(existing_formats)
 709                 return
 710             if req_format is None or req_format == 'best':
 711                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 712             elif req_format == 'worst':
 713                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 714             elif req_format in ('-1', 'all'):
 715                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 716             else:
 717                 # Specific formats. We pick the first in a slash-delimeted sequence.
 718                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 719                 req_formats = req_format.split('/')
 720                 video_url_list = None
 721                 for rf in req_formats:
 722                     if rf in url_map:
 723                         video_url_list = [(rf, url_map[rf])]
 724                         break
 725                 if video_url_list is None:
 726                     raise ExtractorError(u'requested format not available')
 727         else:
 728             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
 729
 730         results = []
 731         for format_param, video_real_url in video_url_list:
 732             # Extension
 733             video_extension = self._video_extensions.get(format_param, 'flv')
 734
 735             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 736                                               self._video_dimensions.get(format_param, '???'))
 737
 738             results.append({
 739                 'id':       video_id,
 740                 'url':      video_real_url,
 741                 'uploader': video_uploader,
 742                 'uploader_id': video_uploader_id,
 743                 'upload_date':  upload_date,
 744                 'title':    video_title,
 745                 'ext':      video_extension,
 746                 'format':   video_format,
 747                 'thumbnail':    video_thumbnail,
 748                 'description':  video_description,
 749                 'player_url':   player_url,
 750                 'subtitles':    video_subtitles,
 751                 'duration':     video_duration
 752             })
 753         return results
 754
 755
 756 class MetacafeIE(InfoExtractor):
 757     """Information Extractor for metacafe.com."""
 758
 759     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 760     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 761     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 762     IE_NAME = u'metacafe'
 763
 764     def report_disclaimer(self):
 765         """Report disclaimer retrieval."""
 766         self.to_screen(u'Retrieving disclaimer')
 767
 768     def _real_initialize(self):
 769         # Retrieve disclaimer
 770         request = compat_urllib_request.Request(self._DISCLAIMER)
 771         try:
 772             self.report_disclaimer()
 773             disclaimer = compat_urllib_request.urlopen(request).read()
 774         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 775             raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
 776
 777         # Confirm age
 778         disclaimer_form = {
 779             'filters': '0',
 780             'submit': "Continue - I'm over 18",
 781             }
 782         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 783         try:
 784             self.report_age_confirmation()
 785             disclaimer = compat_urllib_request.urlopen(request).read()
 786         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 787             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 788
 789     def _real_extract(self, url):
 790         # Extract id and simplified title from URL
 791         mobj = re.match(self._VALID_URL, url)
 792         if mobj is None:
 793             raise ExtractorError(u'Invalid URL: %s' % url)
 794
 795         video_id = mobj.group(1)
 796
 797         # Check if video comes from YouTube
 798         mobj2 = re.match(r'^yt-(.*)$', video_id)
 799         if mobj2 is not None:
 800             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
 801
 802         # Retrieve video webpage to extract further information
 803         webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
 804
 805         # Extract URL, uploader and title from webpage
 806         self.report_extraction(video_id)
 807         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 808         if mobj is not None:
 809             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 810             video_extension = mediaURL[-3:]
 811
 812             # Extract gdaKey if available
 813             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 814             if mobj is None:
 815                 video_url = mediaURL
 816             else:
 817                 gdaKey = mobj.group(1)
 818                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 819         else:
 820             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 821             if mobj is None:
 822                 raise ExtractorError(u'Unable to extract media URL')
 823             vardict = compat_parse_qs(mobj.group(1))
 824             if 'mediaData' not in vardict:
 825                 raise ExtractorError(u'Unable to extract media URL')
 826             mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
 827             if mobj is None:
 828                 raise ExtractorError(u'Unable to extract media URL')
 829             mediaURL = mobj.group('mediaURL').replace('\\/', '/')
 830             video_extension = mediaURL[-3:]
 831             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
 832
 833         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 834         if mobj is None:
 835             raise ExtractorError(u'Unable to extract title')
 836         video_title = mobj.group(1).decode('utf-8')
 837
 838         mobj = re.search(r'submitter=(.*?);', webpage)
 839         if mobj is None:
 840             raise ExtractorError(u'Unable to extract uploader nickname')
 841         video_uploader = mobj.group(1)
 842
 843         return [{
 844             'id':       video_id.decode('utf-8'),
 845             'url':      video_url.decode('utf-8'),
 846             'uploader': video_uploader.decode('utf-8'),
 847             'upload_date':  None,
 848             'title':    video_title,
 849             'ext':      video_extension.decode('utf-8'),
 850         }]
 851
 852 class DailymotionIE(InfoExtractor):
 853     """Information Extractor for Dailymotion"""
 854
 855     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 856     IE_NAME = u'dailymotion'
 857
 858     def _real_extract(self, url):
 859         # Extract id and simplified title from URL
 860         mobj = re.match(self._VALID_URL, url)
 861         if mobj is None:
 862             raise ExtractorError(u'Invalid URL: %s' % url)
 863
 864         video_id = mobj.group(1).split('_')[0].split('?')[0]
 865
 866         video_extension = 'mp4'
 867
 868         # Retrieve video webpage to extract further information
 869         request = compat_urllib_request.Request(url)
 870         request.add_header('Cookie', 'family_filter=off')
 871         webpage = self._download_webpage(request, video_id)
 872
 873         # Extract URL, uploader and title from webpage
 874         self.report_extraction(video_id)
 875         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 876         if mobj is None:
 877             raise ExtractorError(u'Unable to extract media URL')
 878         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 879
 880         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 881             if key in flashvars:
 882                 max_quality = key
 883                 self.to_screen(u'Using %s' % key)
 884                 break
 885         else:
 886             raise ExtractorError(u'Unable to extract video URL')
 887
 888         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 889         if mobj is None:
 890             raise ExtractorError(u'Unable to extract video URL')
 891
 892         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 893
 894         # TODO: support choosing qualities
 895
 896         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 897         if mobj is None:
 898             raise ExtractorError(u'Unable to extract title')
 899         video_title = unescapeHTML(mobj.group('title'))
 900
 901         video_uploader = None
 902         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 903         if mobj is None:
 904             # lookin for official user
 905             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 906             if mobj_official is None:
 907                 self._downloader.report_warning(u'unable to extract uploader nickname')
 908             else:
 909                 video_uploader = mobj_official.group(1)
 910         else:
 911             video_uploader = mobj.group(1)
 912
 913         video_upload_date = None
 914         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 915         if mobj is not None:
 916             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 917
 918         return [{
 919             'id':       video_id,
 920             'url':      video_url,
 921             'uploader': video_uploader,
 922             'upload_date':  video_upload_date,
 923             'title':    video_title,
 924             'ext':      video_extension,
 925         }]
 926
 927
 928 class PhotobucketIE(InfoExtractor):
 929     """Information extractor for photobucket.com."""
 930
 931     # TODO: the original _VALID_URL was:
 932     # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 933     # Check if it's necessary to keep the old extracion process
 934     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
 935     IE_NAME = u'photobucket'
 936
 937     def _real_extract(self, url):
 938         # Extract id from URL
 939         mobj = re.match(self._VALID_URL, url)
 940         if mobj is None:
 941             raise ExtractorError(u'Invalid URL: %s' % url)
 942
 943         video_id = mobj.group('id')
 944
 945         video_extension = mobj.group('ext')
 946
 947         # Retrieve video webpage to extract further information
 948         webpage = self._download_webpage(url, video_id)
 949
 950         # Extract URL, uploader, and title from webpage
 951         self.report_extraction(video_id)
 952         # We try first by looking the javascript code:
 953         mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
 954         if mobj is not None:
 955             info = json.loads(mobj.group('json'))
 956             return [{
 957                 'id':       video_id,
 958                 'url':      info[u'downloadUrl'],
 959                 'uploader': info[u'username'],
 960                 'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
 961                 'title':    info[u'title'],
 962                 'ext':      video_extension,
 963                 'thumbnail': info[u'thumbUrl'],
 964             }]
 965
 966         # We try looking in other parts of the webpage
 967         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 968         if mobj is None:
 969             raise ExtractorError(u'Unable to extract media URL')
 970         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 971
 972         video_url = mediaURL
 973
 974         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 975         if mobj is None:
 976             raise ExtractorError(u'Unable to extract title')
 977         video_title = mobj.group(1).decode('utf-8')
 978
 979         video_uploader = mobj.group(2).decode('utf-8')
 980
 981         return [{
 982             'id':       video_id.decode('utf-8'),
 983             'url':      video_url.decode('utf-8'),
 984             'uploader': video_uploader,
 985             'upload_date':  None,
 986             'title':    video_title,
 987             'ext':      video_extension.decode('utf-8'),
 988         }]
 989
 990
 991 class YahooIE(InfoExtractor):
 992     """Information extractor for screen.yahoo.com."""
 993     _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
 994
 995     def _real_extract(self, url):
 996         mobj = re.match(self._VALID_URL, url)
 997         if mobj is None:
 998             raise ExtractorError(u'Invalid URL: %s' % url)
 999         video_id = mobj.group('id')
1000         webpage = self._download_webpage(url, video_id)
1001         m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1002
1003         if m_id is None:
1004             # TODO: Check which url parameters are required
1005             info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1006             webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
1007             info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1008                         <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1009                         <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1010                         <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1011                         '''
1012             self.report_extraction(video_id)
1013             m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1014             if m_info is None:
1015                 raise ExtractorError(u'Unable to extract video info')
1016             video_title = m_info.group('title')
1017             video_description = m_info.group('description')
1018             video_thumb = m_info.group('thumb')
1019             video_date = m_info.group('date')
1020             video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1021
1022             # TODO: Find a way to get mp4 videos
1023             rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1024             webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1025             m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1026             video_url = m_rest.group('url')
1027             video_path = m_rest.group('path')
1028             if m_rest is None:
1029                 raise ExtractorError(u'Unable to extract video url')
1030
1031         else: # We have to use a different method if another id is defined
1032             long_id = m_id.group('new_id')
1033             info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1034             webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
1035             json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1036             info = json.loads(json_str)
1037             res = info[u'query'][u'results'][u'mediaObj'][0]
1038             stream = res[u'streams'][0]
1039             video_path = stream[u'path']
1040             video_url = stream[u'host']
1041             meta = res[u'meta']
1042             video_title = meta[u'title']
1043             video_description = meta[u'description']
1044             video_thumb = meta[u'thumbnail']
1045             video_date = None # I can't find it
1046
1047         info_dict = {
1048                      'id': video_id,
1049                      'url': video_url,
1050                      'play_path': video_path,
1051                      'title':video_title,
1052                      'description': video_description,
1053                      'thumbnail': video_thumb,
1054                      'upload_date': video_date,
1055                      'ext': 'flv',
1056                      }
1057         return info_dict
1058
1059 class VimeoIE(InfoExtractor):
1060     """Information extractor for vimeo.com."""
1061
1062     # _VALID_URL matches Vimeo URLs
1063     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1064     IE_NAME = u'vimeo'
1065
1066     def _real_extract(self, url, new_video=True):
1067         # Extract ID from URL
1068         mobj = re.match(self._VALID_URL, url)
1069         if mobj is None:
1070             raise ExtractorError(u'Invalid URL: %s' % url)
1071
1072         video_id = mobj.group('id')
1073         if not mobj.group('proto'):
1074             url = 'https://' + url
1075         if mobj.group('direct_link') or mobj.group('pro'):
1076             url = 'https://vimeo.com/' + video_id
1077
1078         # Retrieve video webpage to extract further information
1079         request = compat_urllib_request.Request(url, None, std_headers)
1080         webpage = self._download_webpage(request, video_id)
1081
1082         # Now we begin extracting as much information as we can from what we
1083         # retrieved. First we extract the information common to all extractors,
1084         # and latter we extract those that are Vimeo specific.
1085         self.report_extraction(video_id)
1086
1087         # Extract the config JSON
1088         try:
1089             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1090             config = json.loads(config)
1091         except:
1092             if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1093                 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1094             else:
1095                 raise ExtractorError(u'Unable to extract info section')
1096
1097         # Extract title
1098         video_title = config["video"]["title"]
1099
1100         # Extract uploader and uploader_id
1101         video_uploader = config["video"]["owner"]["name"]
1102         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1103
1104         # Extract video thumbnail
1105         video_thumbnail = config["video"]["thumbnail"]
1106
1107         # Extract video description
1108         video_description = get_element_by_attribute("itemprop", "description", webpage)
1109         if video_description: video_description = clean_html(video_description)
1110         else: video_description = u''
1111
1112         # Extract upload date
1113         video_upload_date = None
1114         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1115         if mobj is not None:
1116             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1117
1118         # Vimeo specific: extract request signature and timestamp
1119         sig = config['request']['signature']
1120         timestamp = config['request']['timestamp']
1121
1122         # Vimeo specific: extract video codec and quality information
1123         # First consider quality, then codecs, then take everything
1124         # TODO bind to format param
1125         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1126         files = { 'hd': [], 'sd': [], 'other': []}
1127         for codec_name, codec_extension in codecs:
1128             if codec_name in config["video"]["files"]:
1129                 if 'hd' in config["video"]["files"][codec_name]:
1130                     files['hd'].append((codec_name, codec_extension, 'hd'))
1131                 elif 'sd' in config["video"]["files"][codec_name]:
1132                     files['sd'].append((codec_name, codec_extension, 'sd'))
1133                 else:
1134                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1135
1136         for quality in ('hd', 'sd', 'other'):
1137             if len(files[quality]) > 0:
1138                 video_quality = files[quality][0][2]
1139                 video_codec = files[quality][0][0]
1140                 video_extension = files[quality][0][1]
1141                 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1142                 break
1143         else:
1144             raise ExtractorError(u'No known codec found')
1145
1146         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1147                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1148
1149         return [{
1150             'id':       video_id,
1151             'url':      video_url,
1152             'uploader': video_uploader,
1153             'uploader_id': video_uploader_id,
1154             'upload_date':  video_upload_date,
1155             'title':    video_title,
1156             'ext':      video_extension,
1157             'thumbnail':    video_thumbnail,
1158             'description':  video_description,
1159         }]
1160
1161
1162 class ArteTvIE(InfoExtractor):
1163     """arte.tv information extractor."""
1164
1165     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1166     _LIVE_URL = r'index-[0-9]+\.html$'
1167
1168     IE_NAME = u'arte.tv'
1169
1170     def fetch_webpage(self, url):
1171         request = compat_urllib_request.Request(url)
1172         try:
1173             self.report_download_webpage(url)
1174             webpage = compat_urllib_request.urlopen(request).read()
1175         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1176             raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1177         except ValueError as err:
1178             raise ExtractorError(u'Invalid URL: %s' % url)
1179         return webpage
1180
1181     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1182         page = self.fetch_webpage(url)
1183         mobj = re.search(regex, page, regexFlags)
1184         info = {}
1185
1186         if mobj is None:
1187             raise ExtractorError(u'Invalid URL: %s' % url)
1188
1189         for (i, key, err) in matchTuples:
1190             if mobj.group(i) is None:
1191                 raise ExtractorError(err)
1192             else:
1193                 info[key] = mobj.group(i)
1194
1195         return info
1196
1197     def extractLiveStream(self, url):
1198         video_lang = url.split('/')[-4]
1199         info = self.grep_webpage(
1200             url,
1201             r'src="(.*?/videothek_js.*?\.js)',
1202             0,
1203             [
1204                 (1, 'url', u'Invalid URL: %s' % url)
1205             ]
1206         )
1207         http_host = url.split('/')[2]
1208         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1209         info = self.grep_webpage(
1210             next_url,
1211             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1212                 '(http://.*?\.swf).*?' +
1213                 '(rtmp://.*?)\'',
1214             re.DOTALL,
1215             [
1216                 (1, 'path',   u'could not extract video path: %s' % url),
1217                 (2, 'player', u'could not extract video player: %s' % url),
1218                 (3, 'url',    u'could not extract video url: %s' % url)
1219             ]
1220         )
1221         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1222
1223     def extractPlus7Stream(self, url):
1224         video_lang = url.split('/')[-3]
1225         info = self.grep_webpage(
1226             url,
1227             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1228             0,
1229             [
1230                 (1, 'url', u'Invalid URL: %s' % url)
1231             ]
1232         )
1233         next_url = compat_urllib_parse.unquote(info.get('url'))
1234         info = self.grep_webpage(
1235             next_url,
1236             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1237             0,
1238             [
1239                 (1, 'url', u'Could not find <video> tag: %s' % url)
1240             ]
1241         )
1242         next_url = compat_urllib_parse.unquote(info.get('url'))
1243
1244         info = self.grep_webpage(
1245             next_url,
1246             r'<video id="(.*?)".*?>.*?' +
1247                 '<name>(.*?)</name>.*?' +
1248                 '<dateVideo>(.*?)</dateVideo>.*?' +
1249                 '<url quality="hd">(.*?)</url>',
1250             re.DOTALL,
1251             [
1252                 (1, 'id',    u'could not extract video id: %s' % url),
1253                 (2, 'title', u'could not extract video title: %s' % url),
1254                 (3, 'date',  u'could not extract video date: %s' % url),
1255                 (4, 'url',   u'could not extract video url: %s' % url)
1256             ]
1257         )
1258
1259         return {
1260             'id':           info.get('id'),
1261             'url':          compat_urllib_parse.unquote(info.get('url')),
1262             'uploader':     u'arte.tv',
1263             'upload_date':  unified_strdate(info.get('date')),
1264             'title':        info.get('title').decode('utf-8'),
1265             'ext':          u'mp4',
1266             'format':       u'NA',
1267             'player_url':   None,
1268         }
1269
1270     def _real_extract(self, url):
1271         video_id = url.split('/')[-1]
1272         self.report_extraction(video_id)
1273
1274         if re.search(self._LIVE_URL, video_id) is not None:
1275             self.extractLiveStream(url)
1276             return
1277         else:
1278             info = self.extractPlus7Stream(url)
1279
1280         return [info]
1281
1282
1283 class GenericIE(InfoExtractor):
1284     """Generic last-resort information extractor."""
1285
1286     _VALID_URL = r'.*'
1287     IE_NAME = u'generic'
1288
1289     def report_download_webpage(self, video_id):
1290         """Report webpage download."""
1291         if not self._downloader.params.get('test', False):
1292             self._downloader.report_warning(u'Falling back on generic information extractor.')
1293         super(GenericIE, self).report_download_webpage(video_id)
1294
1295     def report_following_redirect(self, new_url):
1296         """Report information extraction."""
1297         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1298
1299     def _test_redirect(self, url):
1300         """Check if it is a redirect, like url shorteners, in case return the new url."""
1301         class HeadRequest(compat_urllib_request.Request):
1302             def get_method(self):
1303                 return "HEAD"
1304
1305         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1306             """
1307             Subclass the HTTPRedirectHandler to make it use our
1308             HeadRequest also on the redirected URL
1309             """
1310             def redirect_request(self, req, fp, code, msg, headers, newurl):
1311                 if code in (301, 302, 303, 307):
1312                     newurl = newurl.replace(' ', '%20')
1313                     newheaders = dict((k,v) for k,v in req.headers.items()
1314                                       if k.lower() not in ("content-length", "content-type"))
1315                     return HeadRequest(newurl,
1316                                        headers=newheaders,
1317                                        origin_req_host=req.get_origin_req_host(),
1318                                        unverifiable=True)
1319                 else:
1320                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1321
1322         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1323             """
1324             Fallback to GET if HEAD is not allowed (405 HTTP error)
1325             """
1326             def http_error_405(self, req, fp, code, msg, headers):
1327                 fp.read()
1328                 fp.close()
1329
1330                 newheaders = dict((k,v) for k,v in req.headers.items()
1331                                   if k.lower() not in ("content-length", "content-type"))
1332                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1333                                                  headers=newheaders,
1334                                                  origin_req_host=req.get_origin_req_host(),
1335                                                  unverifiable=True))
1336
1337         # Build our opener
1338         opener = compat_urllib_request.OpenerDirector()
1339         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1340                         HTTPMethodFallback, HEADRedirectHandler,
1341                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1342             opener.add_handler(handler())
1343
1344         response = opener.open(HeadRequest(url))
1345         if response is None:
1346             raise ExtractorError(u'Invalid URL protocol')
1347         new_url = response.geturl()
1348
1349         if url == new_url:
1350             return False
1351
1352         self.report_following_redirect(new_url)
1353         return new_url
1354
1355     def _real_extract(self, url):
1356         new_url = self._test_redirect(url)
1357         if new_url: return [self.url_result(new_url)]
1358
1359         video_id = url.split('/')[-1]
1360         try:
1361             webpage = self._download_webpage(url, video_id)
1362         except ValueError as err:
1363             # since this is the last-resort InfoExtractor, if
1364             # this error is thrown, it'll be thrown here
1365             raise ExtractorError(u'Invalid URL: %s' % url)
1366
1367         self.report_extraction(video_id)
1368         # Start with something easy: JW Player in SWFObject
1369         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1370         if mobj is None:
1371             # Broaden the search a little bit
1372             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1373         if mobj is None:
1374             # Broaden the search a little bit: JWPlayer JS loader
1375             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1376         if mobj is None:
1377             raise ExtractorError(u'Invalid URL: %s' % url)
1378
1379         # It's possible that one of the regexes
1380         # matched, but returned an empty group:
1381         if mobj.group(1) is None:
1382             raise ExtractorError(u'Invalid URL: %s' % url)
1383
1384         video_url = compat_urllib_parse.unquote(mobj.group(1))
1385         video_id = os.path.basename(video_url)
1386
1387         # here's a fun little line of code for you:
1388         video_extension = os.path.splitext(video_id)[1][1:]
1389         video_id = os.path.splitext(video_id)[0]
1390
1391         # it's tempting to parse this further, but you would
1392         # have to take into account all the variations like
1393         #   Video Title - Site Name
1394         #   Site Name | Video Title
1395         #   Video Title - Tagline | Site Name
1396         # and so on and so forth; it's just not practical
1397         mobj = re.search(r'<title>(.*)</title>', webpage)
1398         if mobj is None:
1399             raise ExtractorError(u'Unable to extract title')
1400         video_title = mobj.group(1)
1401
1402         # video uploader is domain name
1403         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1404         if mobj is None:
1405             raise ExtractorError(u'Unable to extract title')
1406         video_uploader = mobj.group(1)
1407
1408         return [{
1409             'id':       video_id,
1410             'url':      video_url,
1411             'uploader': video_uploader,
1412             'upload_date':  None,
1413             'title':    video_title,
1414             'ext':      video_extension,
1415         }]
1416
1417
1418 class YoutubeSearchIE(SearchInfoExtractor):
1419     """Information Extractor for YouTube search queries."""
1420     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1421     _MAX_RESULTS = 1000
1422     IE_NAME = u'youtube:search'
1423     _SEARCH_KEY = 'ytsearch'
1424
1425     def report_download_page(self, query, pagenum):
1426         """Report attempt to download search page with given number."""
1427         query = query.decode(preferredencoding())
1428         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1429
1430     def _get_n_results(self, query, n):
1431         """Get a specified number of results for a query"""
1432
1433         video_ids = []
1434         pagenum = 0
1435         limit = n
1436
1437         while (50 * pagenum) < limit:
1438             self.report_download_page(query, pagenum+1)
1439             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1440             request = compat_urllib_request.Request(result_url)
1441             try:
1442                 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1443             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1444                 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1445             api_response = json.loads(data)['data']
1446
1447             if not 'items' in api_response:
1448                 raise ExtractorError(u'[youtube] No video results')
1449
1450             new_ids = list(video['id'] for video in api_response['items'])
1451             video_ids += new_ids
1452
1453             limit = min(n, api_response['totalItems'])
1454             pagenum += 1
1455
1456         if len(video_ids) > n:
1457             video_ids = video_ids[:n]
1458         videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1459         return self.playlist_result(videos, query)
1460
1461
1462 class GoogleSearchIE(SearchInfoExtractor):
1463     """Information Extractor for Google Video search queries."""
1464     _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1465     _MAX_RESULTS = 1000
1466     IE_NAME = u'video.google:search'
1467     _SEARCH_KEY = 'gvsearch'
1468
1469     def _get_n_results(self, query, n):
1470         """Get a specified number of results for a query"""
1471
1472         res = {
1473             '_type': 'playlist',
1474             'id': query,
1475             'entries': []
1476         }
1477
1478         for pagenum in itertools.count(1):
1479             result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1480             webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1481                                              note='Downloading result page ' + str(pagenum))
1482
1483             for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1484                 e = {
1485                     '_type': 'url',
1486                     'url': mobj.group(1)
1487                 }
1488                 res['entries'].append(e)
1489
1490             if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1491                 return res
1492
1493 class YahooSearchIE(SearchInfoExtractor):
1494     """Information Extractor for Yahoo! Video search queries."""
1495
1496     _MAX_RESULTS = 1000
1497     IE_NAME = u'screen.yahoo:search'
1498     _SEARCH_KEY = 'yvsearch'
1499
1500     def _get_n_results(self, query, n):
1501         """Get a specified number of results for a query"""
1502
1503         res = {
1504             '_type': 'playlist',
1505             'id': query,
1506             'entries': []
1507         }
1508         for pagenum in itertools.count(0):
1509             result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1510             webpage = self._download_webpage(result_url, query,
1511                                              note='Downloading results page '+str(pagenum+1))
1512             info = json.loads(webpage)
1513             m = info[u'm']
1514             results = info[u'results']
1515
1516             for (i, r) in enumerate(results):
1517                 if (pagenum * 30) +i >= n:
1518                     break
1519                 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1520                 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1521                 res['entries'].append(e)
1522             if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1523                 break
1524
1525         return res
1526
1527
1528 class YoutubePlaylistIE(InfoExtractor):
1529     """Information Extractor for YouTube playlists."""
1530
1531     _VALID_URL = r"""(?:
1532                         (?:https?://)?
1533                         (?:\w+\.)?
1534                         youtube\.com/
1535                         (?:
1536                            (?:course|view_play_list|my_playlists|artist|playlist|watch)
1537                            \? (?:.*?&)*? (?:p|a|list)=
1538                         |  p/
1539                         )
1540                         ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1541                         .*
1542                      |
1543                         ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1544                      )"""
1545     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1546     _MAX_RESULTS = 50
1547     IE_NAME = u'youtube:playlist'
1548
1549     @classmethod
1550     def suitable(cls, url):
1551         """Receives a URL and returns True if suitable for this IE."""
1552         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1553
1554     def _real_extract(self, url):
1555         # Extract playlist id
1556         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1557         if mobj is None:
1558             raise ExtractorError(u'Invalid URL: %s' % url)
1559
1560         # Download playlist videos from API
1561         playlist_id = mobj.group(1) or mobj.group(2)
1562         page_num = 1
1563         videos = []
1564
1565         while True:
1566             url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1567             page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1568
1569             try:
1570                 response = json.loads(page)
1571             except ValueError as err:
1572                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1573
1574             if 'feed' not in response:
1575                 raise ExtractorError(u'Got a malformed response from YouTube API')
1576             playlist_title = response['feed']['title']['$t']
1577             if 'entry' not in response['feed']:
1578                 # Number of videos is a multiple of self._MAX_RESULTS
1579                 break
1580
1581             videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1582                         for entry in response['feed']['entry']
1583                         if 'content' in entry ]
1584
1585             if len(response['feed']['entry']) < self._MAX_RESULTS:
1586                 break
1587             page_num += 1
1588
1589         videos = [v[1] for v in sorted(videos)]
1590
1591         url_results = [self.url_result(url, 'Youtube') for url in videos]
1592         return [self.playlist_result(url_results, playlist_id, playlist_title)]
1593
1594
1595 class YoutubeChannelIE(InfoExtractor):
1596     """Information Extractor for YouTube channels."""
1597
1598     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1599     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1600     _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1601     _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1602     IE_NAME = u'youtube:channel'
1603
1604     def extract_videos_from_page(self, page):
1605         ids_in_page = []
1606         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1607             if mobj.group(1) not in ids_in_page:
1608                 ids_in_page.append(mobj.group(1))
1609         return ids_in_page
1610
1611     def _real_extract(self, url):
1612         # Extract channel id
1613         mobj = re.match(self._VALID_URL, url)
1614         if mobj is None:
1615             raise ExtractorError(u'Invalid URL: %s' % url)
1616
1617         # Download channel page
1618         channel_id = mobj.group(1)
1619         video_ids = []
1620         pagenum = 1
1621
1622         url = self._TEMPLATE_URL % (channel_id, pagenum)
1623         page = self._download_webpage(url, channel_id,
1624                                       u'Downloading page #%s' % pagenum)
1625
1626         # Extract video identifiers
1627         ids_in_page = self.extract_videos_from_page(page)
1628         video_ids.extend(ids_in_page)
1629
1630         # Download any subsequent channel pages using the json-based channel_ajax query
1631         if self._MORE_PAGES_INDICATOR in page:
1632             while True:
1633                 pagenum = pagenum + 1
1634
1635                 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1636                 page = self._download_webpage(url, channel_id,
1637                                               u'Downloading page #%s' % pagenum)
1638
1639                 page = json.loads(page)
1640
1641                 ids_in_page = self.extract_videos_from_page(page['content_html'])
1642                 video_ids.extend(ids_in_page)
1643
1644                 if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']:
1645                     break
1646
1647         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1648
1649         urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1650         url_entries = [self.url_result(url, 'Youtube') for url in urls]
1651         return [self.playlist_result(url_entries, channel_id)]
1652
1653
1654 class YoutubeUserIE(InfoExtractor):
1655     """Information Extractor for YouTube users."""
1656
1657     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1658     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1659     _GDATA_PAGE_SIZE = 50
1660     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1661     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1662     IE_NAME = u'youtube:user'
1663
1664     def _real_extract(self, url):
1665         # Extract username
1666         mobj = re.match(self._VALID_URL, url)
1667         if mobj is None:
1668             raise ExtractorError(u'Invalid URL: %s' % url)
1669
1670         username = mobj.group(1)
1671
1672         # Download video ids using YouTube Data API. Result size per
1673         # query is limited (currently to 50 videos) so we need to query
1674         # page by page until there are no video ids - it means we got
1675         # all of them.
1676
1677         video_ids = []
1678         pagenum = 0
1679
1680         while True:
1681             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1682
1683             gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1684             page = self._download_webpage(gdata_url, username,
1685                                           u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1686
1687             # Extract video identifiers
1688             ids_in_page = []
1689
1690             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1691                 if mobj.group(1) not in ids_in_page:
1692                     ids_in_page.append(mobj.group(1))
1693
1694             video_ids.extend(ids_in_page)
1695
1696             # A little optimization - if current page is not
1697             # "full", ie. does not contain PAGE_SIZE video ids then
1698             # we can assume that this page is the last one - there
1699             # are no more ids on further pages - no need to query
1700             # again.
1701
1702             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1703                 break
1704
1705             pagenum += 1
1706
1707         urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1708         url_results = [self.url_result(url, 'Youtube') for url in urls]
1709         return [self.playlist_result(url_results, playlist_title = username)]
1710
1711
1712 class BlipTVUserIE(InfoExtractor):
1713     """Information Extractor for blip.tv users."""
1714
1715     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1716     _PAGE_SIZE = 12
1717     IE_NAME = u'blip.tv:user'
1718
1719     def _real_extract(self, url):
1720         # Extract username
1721         mobj = re.match(self._VALID_URL, url)
1722         if mobj is None:
1723             raise ExtractorError(u'Invalid URL: %s' % url)
1724
1725         username = mobj.group(1)
1726
1727         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1728
1729         page = self._download_webpage(url, username, u'Downloading user page')
1730         mobj = re.search(r'data-users-id="([^"]+)"', page)
1731         page_base = page_base % mobj.group(1)
1732
1733
1734         # Download video ids using BlipTV Ajax calls. Result size per
1735         # query is limited (currently to 12 videos) so we need to query
1736         # page by page until there are no video ids - it means we got
1737         # all of them.
1738
1739         video_ids = []
1740         pagenum = 1
1741
1742         while True:
1743             url = page_base + "&page=" + str(pagenum)
1744             page = self._download_webpage(url, username,
1745                                           u'Downloading video ids from page %d' % pagenum)
1746
1747             # Extract video identifiers
1748             ids_in_page = []
1749
1750             for mobj in re.finditer(r'href="/([^"]+)"', page):
1751                 if mobj.group(1) not in ids_in_page:
1752                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1753
1754             video_ids.extend(ids_in_page)
1755
1756             # A little optimization - if current page is not
1757             # "full", ie. does not contain PAGE_SIZE video ids then
1758             # we can assume that this page is the last one - there
1759             # are no more ids on further pages - no need to query
1760             # again.
1761
1762             if len(ids_in_page) < self._PAGE_SIZE:
1763                 break
1764
1765             pagenum += 1
1766
1767         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1768         url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1769         return [self.playlist_result(url_entries, playlist_title = username)]
1770
1771
1772 class DepositFilesIE(InfoExtractor):
1773     """Information extractor for depositfiles.com"""
1774
1775     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1776
1777     def _real_extract(self, url):
1778         file_id = url.split('/')[-1]
1779         # Rebuild url in english locale
1780         url = 'http://depositfiles.com/en/files/' + file_id
1781
1782         # Retrieve file webpage with 'Free download' button pressed
1783         free_download_indication = { 'gateway_result' : '1' }
1784         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1785         try:
1786             self.report_download_webpage(file_id)
1787             webpage = compat_urllib_request.urlopen(request).read()
1788         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1789             raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1790
1791         # Search for the real file URL
1792         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1793         if (mobj is None) or (mobj.group(1) is None):
1794             # Try to figure out reason of the error.
1795             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1796             if (mobj is not None) and (mobj.group(1) is not None):
1797                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1798                 raise ExtractorError(u'%s' % restriction_message)
1799             else:
1800                 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1801
1802         file_url = mobj.group(1)
1803         file_extension = os.path.splitext(file_url)[1][1:]
1804
1805         # Search for file title
1806         mobj = re.search(r'<b title="(.*?)">', webpage)
1807         if mobj is None:
1808             raise ExtractorError(u'Unable to extract title')
1809         file_title = mobj.group(1).decode('utf-8')
1810
1811         return [{
1812             'id':       file_id.decode('utf-8'),
1813             'url':      file_url.decode('utf-8'),
1814             'uploader': None,
1815             'upload_date':  None,
1816             'title':    file_title,
1817             'ext':      file_extension.decode('utf-8'),
1818         }]
1819
1820
1821 class FacebookIE(InfoExtractor):
1822     """Information Extractor for Facebook"""
1823
1824     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1825     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1826     _NETRC_MACHINE = 'facebook'
1827     IE_NAME = u'facebook'
1828
1829     def report_login(self):
1830         """Report attempt to log in."""
1831         self.to_screen(u'Logging in')
1832
1833     def _real_initialize(self):
1834         if self._downloader is None:
1835             return
1836
1837         useremail = None
1838         password = None
1839         downloader_params = self._downloader.params
1840
1841         # Attempt to use provided username and password or .netrc data
1842         if downloader_params.get('username', None) is not None:
1843             useremail = downloader_params['username']
1844             password = downloader_params['password']
1845         elif downloader_params.get('usenetrc', False):
1846             try:
1847                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1848                 if info is not None:
1849                     useremail = info[0]
1850                     password = info[2]
1851                 else:
1852                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1853             except (IOError, netrc.NetrcParseError) as err:
1854                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1855                 return
1856
1857         if useremail is None:
1858             return
1859
1860         # Log in
1861         login_form = {
1862             'email': useremail,
1863             'pass': password,
1864             'login': 'Log+In'
1865             }
1866         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1867         try:
1868             self.report_login()
1869             login_results = compat_urllib_request.urlopen(request).read()
1870             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1871                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1872                 return
1873         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1874             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1875             return
1876
1877     def _real_extract(self, url):
1878         mobj = re.match(self._VALID_URL, url)
1879         if mobj is None:
1880             raise ExtractorError(u'Invalid URL: %s' % url)
1881         video_id = mobj.group('ID')
1882
1883         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1884         webpage = self._download_webpage(url, video_id)
1885
1886         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1887         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1888         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1889         if not m:
1890             raise ExtractorError(u'Cannot parse data')
1891         data = dict(json.loads(m.group(1)))
1892         params_raw = compat_urllib_parse.unquote(data['params'])
1893         params = json.loads(params_raw)
1894         video_data = params['video_data'][0]
1895         video_url = video_data.get('hd_src')
1896         if not video_url:
1897             video_url = video_data['sd_src']
1898         if not video_url:
1899             raise ExtractorError(u'Cannot find video URL')
1900         video_duration = int(video_data['video_duration'])
1901         thumbnail = video_data['thumbnail_src']
1902
1903         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
1904         if not m:
1905             raise ExtractorError(u'Cannot find title in webpage')
1906         video_title = unescapeHTML(m.group(1))
1907
1908         info = {
1909             'id': video_id,
1910             'title': video_title,
1911             'url': video_url,
1912             'ext': 'mp4',
1913             'duration': video_duration,
1914             'thumbnail': thumbnail,
1915         }
1916         return [info]
1917
1918
1919 class BlipTVIE(InfoExtractor):
1920     """Information extractor for blip.tv"""
1921
1922     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
1923     _URL_EXT = r'^.*\.([a-z0-9]+)$'
1924     IE_NAME = u'blip.tv'
1925
1926     def report_direct_download(self, title):
1927         """Report information extraction."""
1928         self.to_screen(u'%s: Direct download detected' % title)
1929
1930     def _real_extract(self, url):
1931         mobj = re.match(self._VALID_URL, url)
1932         if mobj is None:
1933             raise ExtractorError(u'Invalid URL: %s' % url)
1934
1935         # See https://github.com/rg3/youtube-dl/issues/857
1936         api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1937         if api_mobj is not None:
1938             url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1939         urlp = compat_urllib_parse_urlparse(url)
1940         if urlp.path.startswith('/play/'):
1941             request = compat_urllib_request.Request(url)
1942             response = compat_urllib_request.urlopen(request)
1943             redirecturl = response.geturl()
1944             rurlp = compat_urllib_parse_urlparse(redirecturl)
1945             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1946             url = 'http://blip.tv/a/a-' + file_id
1947             return self._real_extract(url)
1948
1949
1950         if '?' in url:
1951             cchar = '&'
1952         else:
1953             cchar = '?'
1954         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1955         request = compat_urllib_request.Request(json_url)
1956         request.add_header('User-Agent', 'iTunes/10.6.1')
1957         self.report_extraction(mobj.group(1))
1958         info = None
1959         try:
1960             urlh = compat_urllib_request.urlopen(request)
1961             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1962                 basename = url.split('/')[-1]
1963                 title,ext = os.path.splitext(basename)
1964                 title = title.decode('UTF-8')
1965                 ext = ext.replace('.', '')
1966                 self.report_direct_download(title)
1967                 info = {
1968                     'id': title,
1969                     'url': url,
1970                     'uploader': None,
1971                     'upload_date': None,
1972                     'title': title,
1973                     'ext': ext,
1974                     'urlhandle': urlh
1975                 }
1976         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1977             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
1978         if info is None: # Regular URL
1979             try:
1980                 json_code_bytes = urlh.read()
1981                 json_code = json_code_bytes.decode('utf-8')
1982             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1983                 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
1984
1985             try:
1986                 json_data = json.loads(json_code)
1987                 if 'Post' in json_data:
1988                     data = json_data['Post']
1989                 else:
1990                     data = json_data
1991
1992                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
1993                 video_url = data['media']['url']
1994                 umobj = re.match(self._URL_EXT, video_url)
1995                 if umobj is None:
1996                     raise ValueError('Can not determine filename extension')
1997                 ext = umobj.group(1)
1998
1999                 info = {
2000                     'id': data['item_id'],
2001                     'url': video_url,
2002                     'uploader': data['display_name'],
2003                     'upload_date': upload_date,
2004                     'title': data['title'],
2005                     'ext': ext,
2006                     'format': data['media']['mimeType'],
2007                     'thumbnail': data['thumbnailUrl'],
2008                     'description': data['description'],
2009                     'player_url': data['embedUrl'],
2010                     'user_agent': 'iTunes/10.6.1',
2011                 }
2012             except (ValueError,KeyError) as err:
2013                 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
2014
2015         return [info]
2016
2017
2018 class MyVideoIE(InfoExtractor):
2019     """Information Extractor for myvideo.de."""
2020
2021     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2022     IE_NAME = u'myvideo'
2023
2024     # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2025     # Released into the Public Domain by Tristan Fischer on 2013-05-19
2026     # https://github.com/rg3/youtube-dl/pull/842
2027     def __rc4crypt(self,data, key):
2028         x = 0
2029         box = list(range(256))
2030         for i in list(range(256)):
2031             x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2032             box[i], box[x] = box[x], box[i]
2033         x = 0
2034         y = 0
2035         out = ''
2036         for char in data:
2037             x = (x + 1) % 256
2038             y = (y + box[x]) % 256
2039             box[x], box[y] = box[y], box[x]
2040             out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
2041         return out
2042
2043     def __md5(self,s):
2044         return hashlib.md5(s).hexdigest().encode()
2045
2046     def _real_extract(self,url):
2047         mobj = re.match(self._VALID_URL, url)
2048         if mobj is None:
2049             raise ExtractorError(u'invalid URL: %s' % url)
2050
2051         video_id = mobj.group(1)
2052
2053         GK = (
2054           b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2055           b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2056           b'TnpsbA0KTVRkbU1tSTRNdz09'
2057         )
2058
2059         # Get video webpage
2060         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2061         webpage = self._download_webpage(webpage_url, video_id)
2062
2063         mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2064         if mobj is not None:
2065             self.report_extraction(video_id)
2066             video_url = mobj.group(1) + '.flv'
2067
2068             mobj = re.search('<title>([^<]+)</title>', webpage)
2069             if mobj is None:
2070                 raise ExtractorError(u'Unable to extract title')
2071             video_title = mobj.group(1)
2072
2073             mobj = re.search('[.](.+?)$', video_url)
2074             if mobj is None:
2075                 raise ExtractorError(u'Unable to extract extention')
2076             video_ext = mobj.group(1)
2077
2078             return [{
2079                 'id':       video_id,
2080                 'url':      video_url,
2081                 'uploader': None,
2082                 'upload_date':  None,
2083                 'title':    video_title,
2084                 'ext':      u'flv',
2085             }]
2086
2087         # try encxml
2088         mobj = re.search('var flashvars={(.+?)}', webpage)
2089         if mobj is None:
2090             raise ExtractorError(u'Unable to extract video')
2091
2092         params = {}
2093         encxml = ''
2094         sec = mobj.group(1)
2095         for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2096             if not a == '_encxml':
2097                 params[a] = b
2098             else:
2099                 encxml = compat_urllib_parse.unquote(b)
2100         if not params.get('domain'):
2101             params['domain'] = 'www.myvideo.de'
2102         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2103         if 'flash_playertype=MTV' in xmldata_url:
2104             self._downloader.report_warning(u'avoiding MTV player')
2105             xmldata_url = (
2106                 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2107                 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
2108             ) % video_id
2109
2110         # get enc data
2111         enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2112         enc_data_b = binascii.unhexlify(enc_data)
2113         sk = self.__md5(
2114             base64.b64decode(base64.b64decode(GK)) +
2115             self.__md5(
2116                 str(video_id).encode('utf-8')
2117             )
2118         )
2119         dec_data = self.__rc4crypt(enc_data_b, sk)
2120
2121         # extracting infos
2122         self.report_extraction(video_id)
2123
2124         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2125         if mobj is None:
2126             raise ExtractorError(u'unable to extract rtmpurl')
2127         video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
2128         if 'myvideo2flash' in video_rtmpurl:
2129             self._downloader.report_warning(u'forcing RTMPT ...')
2130             video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')
2131
2132         # extract non rtmp videos
2133         if (video_rtmpurl is None) or (video_rtmpurl == ''):
2134             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2135             if mobj is None:
2136                 raise ExtractorError(u'unable to extract url')
2137             video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2138
2139         mobj = re.search('source=\'(.*?)\'', dec_data)
2140         if mobj is None:
2141             raise ExtractorError(u'unable to extract swfobj')
2142         video_file     = compat_urllib_parse.unquote(mobj.group(1))
2143
2144         if not video_file.endswith('f4m'):
2145             ppath, prefix = video_file.split('.')
2146             video_playpath = '%s:%s' % (prefix, ppath)
2147             video_hls_playlist = ''
2148         else:
2149             video_playpath = ''
2150             video_hls_playlist = (
2151                 video_filepath + video_file
2152             ).replace('.f4m', '.m3u8')
2153
2154         mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
2155         if mobj is None:
2156             raise ExtractorError(u'unable to extract swfobj')
2157         video_swfobj = compat_urllib_parse.unquote(mobj.group(1))
2158
2159         mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
2160         if mobj is None:
2161             raise ExtractorError(u'unable to extract title')
2162         video_title = mobj.group(1)
2163
2164         return [{
2165             'id':                 video_id,
2166             'url':                video_rtmpurl,
2167             'tc_url':             video_rtmpurl,
2168             'uploader':           None,
2169             'upload_date':        None,
2170             'title':              video_title,
2171             'ext':                u'flv',
2172             'play_path':          video_playpath,
2173             'video_file':         video_file,
2174             'video_hls_playlist': video_hls_playlist,
2175             'player_url':         video_swfobj,
2176         }]
2177
2178 class ComedyCentralIE(InfoExtractor):
2179     """Information extractor for The Daily Show and Colbert Report """
2180
2181     # urls can be abbreviations like :thedailyshow or :colbert
2182     # urls for episodes like:
2183     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2184     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2185     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2186     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2187                       |(https?://)?(www\.)?
2188                           (?P<showname>thedailyshow|colbertnation)\.com/
2189                          (full-episodes/(?P<episode>.*)|
2190                           (?P<clip>
2191                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2192                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2193                      $"""
2194
2195     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2196
2197     _video_extensions = {
2198         '3500': 'mp4',
2199         '2200': 'mp4',
2200         '1700': 'mp4',
2201         '1200': 'mp4',
2202         '750': 'mp4',
2203         '400': 'mp4',
2204     }
2205     _video_dimensions = {
2206         '3500': '1280x720',
2207         '2200': '960x540',
2208         '1700': '768x432',
2209         '1200': '640x360',
2210         '750': '512x288',
2211         '400': '384x216',
2212     }
2213
2214     @classmethod
2215     def suitable(cls, url):
2216         """Receives a URL and returns True if suitable for this IE."""
2217         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2218
2219     def _print_formats(self, formats):
2220         print('Available formats:')
2221         for x in formats:
2222             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2223
2224
2225     def _real_extract(self, url):
2226         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2227         if mobj is None:
2228             raise ExtractorError(u'Invalid URL: %s' % url)
2229
2230         if mobj.group('shortname'):
2231             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2232                 url = u'http://www.thedailyshow.com/full-episodes/'
2233             else:
2234                 url = u'http://www.colbertnation.com/full-episodes/'
2235             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2236             assert mobj is not None
2237
2238         if mobj.group('clip'):
2239             if mobj.group('showname') == 'thedailyshow':
2240                 epTitle = mobj.group('tdstitle')
2241             else:
2242                 epTitle = mobj.group('cntitle')
2243             dlNewest = False
2244         else:
2245             dlNewest = not mobj.group('episode')
2246             if dlNewest:
2247                 epTitle = mobj.group('showname')
2248             else:
2249                 epTitle = mobj.group('episode')
2250
2251         self.report_extraction(epTitle)
2252         webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2253         if dlNewest:
2254             url = htmlHandle.geturl()
2255             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2256             if mobj is None:
2257                 raise ExtractorError(u'Invalid redirected URL: ' + url)
2258             if mobj.group('episode') == '':
2259                 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2260             epTitle = mobj.group('episode')
2261
2262         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2263
2264         if len(mMovieParams) == 0:
2265             # The Colbert Report embeds the information in a without
2266             # a URL prefix; so extract the alternate reference
2267             # and then add the URL prefix manually.
2268
2269             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2270             if len(altMovieParams) == 0:
2271                 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2272             else:
2273                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2274
2275         uri = mMovieParams[0][1]
2276         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2277         indexXml = self._download_webpage(indexUrl, epTitle,
2278                                           u'Downloading show index',
2279                                           u'unable to download episode index')
2280
2281         results = []
2282
2283         idoc = xml.etree.ElementTree.fromstring(indexXml)
2284         itemEls = idoc.findall('.//item')
2285         for partNum,itemEl in enumerate(itemEls):
2286             mediaId = itemEl.findall('./guid')[0].text
2287             shortMediaId = mediaId.split(':')[-1]
2288             showId = mediaId.split(':')[-2].replace('.com', '')
2289             officialTitle = itemEl.findall('./title')[0].text
2290             officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2291
2292             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2293                         compat_urllib_parse.urlencode({'uri': mediaId}))
2294             configXml = self._download_webpage(configUrl, epTitle,
2295                                                u'Downloading configuration for %s' % shortMediaId)
2296
2297             cdoc = xml.etree.ElementTree.fromstring(configXml)
2298             turls = []
2299             for rendition in cdoc.findall('.//rendition'):
2300                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2301                 turls.append(finfo)
2302
2303             if len(turls) == 0:
2304                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2305                 continue
2306
2307             if self._downloader.params.get('listformats', None):
2308                 self._print_formats([i[0] for i in turls])
2309                 return
2310
2311             # For now, just pick the highest bitrate
2312             format,rtmp_video_url = turls[-1]
2313
2314             # Get the format arg from the arg stream
2315             req_format = self._downloader.params.get('format', None)
2316
2317             # Select format if we can find one
2318             for f,v in turls:
2319                 if f == req_format:
2320                     format, rtmp_video_url = f, v
2321                     break
2322
2323             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2324             if not m:
2325                 raise ExtractorError(u'Cannot transform RTMP url')
2326             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2327             video_url = base + m.group('finalid')
2328
2329             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2330             info = {
2331                 'id': shortMediaId,
2332                 'url': video_url,
2333                 'uploader': showId,
2334                 'upload_date': officialDate,
2335                 'title': effTitle,
2336                 'ext': 'mp4',
2337                 'format': format,
2338                 'thumbnail': None,
2339                 'description': officialTitle,
2340             }
2341             results.append(info)
2342
2343         return results
2344
2345
2346 class EscapistIE(InfoExtractor):
2347     """Information extractor for The Escapist """
2348
2349     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2350     IE_NAME = u'escapist'
2351
2352     def _real_extract(self, url):
2353         mobj = re.match(self._VALID_URL, url)
2354         if mobj is None:
2355             raise ExtractorError(u'Invalid URL: %s' % url)
2356         showName = mobj.group('showname')
2357         videoId = mobj.group('episode')
2358
2359         self.report_extraction(showName)
2360         webPage = self._download_webpage(url, showName)
2361
2362         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2363         description = unescapeHTML(descMatch.group(1))
2364         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2365         imgUrl = unescapeHTML(imgMatch.group(1))
2366         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2367         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2368         configUrlMatch = re.search('config=(.*)$', playerUrl)
2369         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2370
2371         configJSON = self._download_webpage(configUrl, showName,
2372                                             u'Downloading configuration',
2373                                             u'unable to download configuration')
2374
2375         # Technically, it's JavaScript, not JSON
2376         configJSON = configJSON.replace("'", '"')
2377
2378         try:
2379             config = json.loads(configJSON)
2380         except (ValueError,) as err:
2381             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2382
2383         playlist = config['playlist']
2384         videoUrl = playlist[1]['url']
2385
2386         info = {
2387             'id': videoId,
2388             'url': videoUrl,
2389             'uploader': showName,
2390             'upload_date': None,
2391             'title': showName,
2392             'ext': 'mp4',
2393             'thumbnail': imgUrl,
2394             'description': description,
2395             'player_url': playerUrl,
2396         }
2397
2398         return [info]
2399
2400 class CollegeHumorIE(InfoExtractor):
2401     """Information extractor for collegehumor.com"""
2402
2403     _WORKING = False
2404     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2405     IE_NAME = u'collegehumor'
2406
2407     def report_manifest(self, video_id):
2408         """Report information extraction."""
2409         self.to_screen(u'%s: Downloading XML manifest' % video_id)
2410
2411     def _real_extract(self, url):
2412         mobj = re.match(self._VALID_URL, url)
2413         if mobj is None:
2414             raise ExtractorError(u'Invalid URL: %s' % url)
2415         video_id = mobj.group('videoid')
2416
2417         info = {
2418             'id': video_id,
2419             'uploader': None,
2420             'upload_date': None,
2421         }
2422
2423         self.report_extraction(video_id)
2424         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2425         try:
2426             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2427         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2428             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2429
2430         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2431         try:
2432             videoNode = mdoc.findall('./video')[0]
2433             info['description'] = videoNode.findall('./description')[0].text
2434             info['title'] = videoNode.findall('./caption')[0].text
2435             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2436             manifest_url = videoNode.findall('./file')[0].text
2437         except IndexError:
2438             raise ExtractorError(u'Invalid metadata XML file')
2439
2440         manifest_url += '?hdcore=2.10.3'
2441         self.report_manifest(video_id)
2442         try:
2443             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2444         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2445             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2446
2447         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2448         try:
2449             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2450             node_id = media_node.attrib['url']
2451             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2452         except IndexError as err:
2453             raise ExtractorError(u'Invalid manifest file')
2454
2455         url_pr = compat_urllib_parse_urlparse(manifest_url)
2456         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2457
2458         info['url'] = url
2459         info['ext'] = 'f4f'
2460         return [info]
2461
2462
2463 class XVideosIE(InfoExtractor):
2464     """Information extractor for xvideos.com"""
2465
2466     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2467     IE_NAME = u'xvideos'
2468
2469     def _real_extract(self, url):
2470         mobj = re.match(self._VALID_URL, url)
2471         if mobj is None:
2472             raise ExtractorError(u'Invalid URL: %s' % url)
2473         video_id = mobj.group(1)
2474
2475         webpage = self._download_webpage(url, video_id)
2476
2477         self.report_extraction(video_id)
2478
2479
2480         # Extract video URL
2481         mobj = re.search(r'flv_url=(.+?)&', webpage)
2482         if mobj is None:
2483             raise ExtractorError(u'Unable to extract video url')
2484         video_url = compat_urllib_parse.unquote(mobj.group(1))
2485
2486
2487         # Extract title
2488         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2489         if mobj is None:
2490             raise ExtractorError(u'Unable to extract video title')
2491         video_title = mobj.group(1)
2492
2493
2494         # Extract video thumbnail
2495         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2496         if mobj is None:
2497             raise ExtractorError(u'Unable to extract video thumbnail')
2498         video_thumbnail = mobj.group(0)
2499
2500         info = {
2501             'id': video_id,
2502             'url': video_url,
2503             'uploader': None,
2504             'upload_date': None,
2505             'title': video_title,
2506             'ext': 'flv',
2507             'thumbnail': video_thumbnail,
2508             'description': None,
2509         }
2510
2511         return [info]
2512
2513
2514 class SoundcloudIE(InfoExtractor):
2515     """Information extractor for soundcloud.com
2516        To access the media, the uid of the song and a stream token
2517        must be extracted from the page source and the script must make
2518        a request to media.soundcloud.com/crossdomain.xml. Then
2519        the media can be grabbed by requesting from an url composed
2520        of the stream token and uid
2521      """
2522
2523     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2524     IE_NAME = u'soundcloud'
2525
2526     def report_resolve(self, video_id):
2527         """Report information extraction."""
2528         self.to_screen(u'%s: Resolving id' % video_id)
2529
2530     def _real_extract(self, url):
2531         mobj = re.match(self._VALID_URL, url)
2532         if mobj is None:
2533             raise ExtractorError(u'Invalid URL: %s' % url)
2534
2535         # extract uploader (which is in the url)
2536         uploader = mobj.group(1)
2537         # extract simple title (uploader + slug of song title)
2538         slug_title =  mobj.group(2)
2539         simple_title = uploader + u'-' + slug_title
2540         full_title = '%s/%s' % (uploader, slug_title)
2541
2542         self.report_resolve(full_title)
2543
2544         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2545         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2546         info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2547
2548         info = json.loads(info_json)
2549         video_id = info['id']
2550         self.report_extraction(full_title)
2551
2552         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2553         stream_json = self._download_webpage(streams_url, full_title,
2554                                              u'Downloading stream definitions',
2555                                              u'unable to download stream definitions')
2556
2557         streams = json.loads(stream_json)
2558         mediaURL = streams['http_mp3_128_url']
2559         upload_date = unified_strdate(info['created_at'])
2560
2561         return [{
2562             'id':       info['id'],
2563             'url':      mediaURL,
2564             'uploader': info['user']['username'],
2565             'upload_date': upload_date,
2566             'title':    info['title'],
2567             'ext':      u'mp3',
2568             'description': info['description'],
2569         }]
2570
2571 class SoundcloudSetIE(InfoExtractor):
2572     """Information extractor for soundcloud.com sets
2573        To access the media, the uid of the song and a stream token
2574        must be extracted from the page source and the script must make
2575        a request to media.soundcloud.com/crossdomain.xml. Then
2576        the media can be grabbed by requesting from an url composed
2577        of the stream token and uid
2578      """
2579
2580     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2581     IE_NAME = u'soundcloud:set'
2582
2583     def report_resolve(self, video_id):
2584         """Report information extraction."""
2585         self.to_screen(u'%s: Resolving id' % video_id)
2586
2587     def _real_extract(self, url):
2588         mobj = re.match(self._VALID_URL, url)
2589         if mobj is None:
2590             raise ExtractorError(u'Invalid URL: %s' % url)
2591
2592         # extract uploader (which is in the url)
2593         uploader = mobj.group(1)
2594         # extract simple title (uploader + slug of song title)
2595         slug_title =  mobj.group(2)
2596         simple_title = uploader + u'-' + slug_title
2597         full_title = '%s/sets/%s' % (uploader, slug_title)
2598
2599         self.report_resolve(full_title)
2600
2601         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2602         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2603         info_json = self._download_webpage(resolv_url, full_title)
2604
2605         videos = []
2606         info = json.loads(info_json)
2607         if 'errors' in info:
2608             for err in info['errors']:
2609                 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2610             return
2611
2612         self.report_extraction(full_title)
2613         for track in info['tracks']:
2614             video_id = track['id']
2615
2616             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2617             stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2618
2619             self.report_extraction(video_id)
2620             streams = json.loads(stream_json)
2621             mediaURL = streams['http_mp3_128_url']
2622
2623             videos.append({
2624                 'id':       video_id,
2625                 'url':      mediaURL,
2626                 'uploader': track['user']['username'],
2627                 'upload_date':  unified_strdate(track['created_at']),
2628                 'title':    track['title'],
2629                 'ext':      u'mp3',
2630                 'description': track['description'],
2631             })
2632         return videos
2633
2634
2635 class InfoQIE(InfoExtractor):
2636     """Information extractor for infoq.com"""
2637     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2638
2639     def _real_extract(self, url):
2640         mobj = re.match(self._VALID_URL, url)
2641         if mobj is None:
2642             raise ExtractorError(u'Invalid URL: %s' % url)
2643
2644         webpage = self._download_webpage(url, video_id=url)
2645         self.report_extraction(url)
2646
2647         # Extract video URL
2648         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2649         if mobj is None:
2650             raise ExtractorError(u'Unable to extract video url')
2651         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2652         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2653
2654         # Extract title
2655         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2656         if mobj is None:
2657             raise ExtractorError(u'Unable to extract video title')
2658         video_title = mobj.group(1)
2659
2660         # Extract description
2661         video_description = u'No description available.'
2662         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2663         if mobj is not None:
2664             video_description = mobj.group(1)
2665
2666         video_filename = video_url.split('/')[-1]
2667         video_id, extension = video_filename.split('.')
2668
2669         info = {
2670             'id': video_id,
2671             'url': video_url,
2672             'uploader': None,
2673             'upload_date': None,
2674             'title': video_title,
2675             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2676             'thumbnail': None,
2677             'description': video_description,
2678         }
2679
2680         return [info]
2681
2682 class MixcloudIE(InfoExtractor):
2683     """Information extractor for www.mixcloud.com"""
2684
2685     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2686     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2687     IE_NAME = u'mixcloud'
2688
2689     def report_download_json(self, file_id):
2690         """Report JSON download."""
2691         self.to_screen(u'Downloading json')
2692
2693     def get_urls(self, jsonData, fmt, bitrate='best'):
2694         """Get urls from 'audio_formats' section in json"""
2695         file_url = None
2696         try:
2697             bitrate_list = jsonData[fmt]
2698             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2699                 bitrate = max(bitrate_list) # select highest
2700
2701             url_list = jsonData[fmt][bitrate]
2702         except TypeError: # we have no bitrate info.
2703             url_list = jsonData[fmt]
2704         return url_list
2705
2706     def check_urls(self, url_list):
2707         """Returns 1st active url from list"""
2708         for url in url_list:
2709             try:
2710                 compat_urllib_request.urlopen(url)
2711                 return url
2712             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2713                 url = None
2714
2715         return None
2716
2717     def _print_formats(self, formats):
2718         print('Available formats:')
2719         for fmt in formats.keys():
2720             for b in formats[fmt]:
2721                 try:
2722                     ext = formats[fmt][b][0]
2723                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2724                 except TypeError: # we have no bitrate info
2725                     ext = formats[fmt][0]
2726                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2727                     break
2728
2729     def _real_extract(self, url):
2730         mobj = re.match(self._VALID_URL, url)
2731         if mobj is None:
2732             raise ExtractorError(u'Invalid URL: %s' % url)
2733         # extract uploader & filename from url
2734         uploader = mobj.group(1).decode('utf-8')
2735         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2736
2737         # construct API request
2738         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2739         # retrieve .json file with links to files
2740         request = compat_urllib_request.Request(file_url)
2741         try:
2742             self.report_download_json(file_url)
2743             jsonData = compat_urllib_request.urlopen(request).read()
2744         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2745             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2746
2747         # parse JSON
2748         json_data = json.loads(jsonData)
2749         player_url = json_data['player_swf_url']
2750         formats = dict(json_data['audio_formats'])
2751
2752         req_format = self._downloader.params.get('format', None)
2753         bitrate = None
2754
2755         if self._downloader.params.get('listformats', None):
2756             self._print_formats(formats)
2757             return
2758
2759         if req_format is None or req_format == 'best':
2760             for format_param in formats.keys():
2761                 url_list = self.get_urls(formats, format_param)
2762                 # check urls
2763                 file_url = self.check_urls(url_list)
2764                 if file_url is not None:
2765                     break # got it!
2766         else:
2767             if req_format not in formats:
2768                 raise ExtractorError(u'Format is not available')
2769
2770             url_list = self.get_urls(formats, req_format)
2771             file_url = self.check_urls(url_list)
2772             format_param = req_format
2773
2774         return [{
2775             'id': file_id.decode('utf-8'),
2776             'url': file_url.decode('utf-8'),
2777             'uploader': uploader.decode('utf-8'),
2778             'upload_date': None,
2779             'title': json_data['name'],
2780             'ext': file_url.split('.')[-1].decode('utf-8'),
2781             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2782             'thumbnail': json_data['thumbnail_url'],
2783             'description': json_data['description'],
2784             'player_url': player_url.decode('utf-8'),
2785         }]
2786
2787 class StanfordOpenClassroomIE(InfoExtractor):
2788     """Information extractor for Stanford's Open ClassRoom"""
2789
2790     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2791     IE_NAME = u'stanfordoc'
2792
2793     def _real_extract(self, url):
2794         mobj = re.match(self._VALID_URL, url)
2795         if mobj is None:
2796             raise ExtractorError(u'Invalid URL: %s' % url)
2797
2798         if mobj.group('course') and mobj.group('video'): # A specific video
2799             course = mobj.group('course')
2800             video = mobj.group('video')
2801             info = {
2802                 'id': course + '_' + video,
2803                 'uploader': None,
2804                 'upload_date': None,
2805             }
2806
2807             self.report_extraction(info['id'])
2808             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2809             xmlUrl = baseUrl + video + '.xml'
2810             try:
2811                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2812             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2813                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2814             mdoc = xml.etree.ElementTree.fromstring(metaXml)
2815             try:
2816                 info['title'] = mdoc.findall('./title')[0].text
2817                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2818             except IndexError:
2819                 raise ExtractorError(u'Invalid metadata XML file')
2820             info['ext'] = info['url'].rpartition('.')[2]
2821             return [info]
2822         elif mobj.group('course'): # A course page
2823             course = mobj.group('course')
2824             info = {
2825                 'id': course,
2826                 'type': 'playlist',
2827                 'uploader': None,
2828                 'upload_date': None,
2829             }
2830
2831             coursepage = self._download_webpage(url, info['id'],
2832                                         note='Downloading course info page',
2833                                         errnote='Unable to download course info page')
2834
2835             m = re.search('<h1>([^<]+)</h1>', coursepage)
2836             if m:
2837                 info['title'] = unescapeHTML(m.group(1))
2838             else:
2839                 info['title'] = info['id']
2840
2841             m = re.search('<description>([^<]+)</description>', coursepage)
2842             if m:
2843                 info['description'] = unescapeHTML(m.group(1))
2844
2845             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2846             info['list'] = [
2847                 {
2848                     'type': 'reference',
2849                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2850                 }
2851                     for vpage in links]
2852             results = []
2853             for entry in info['list']:
2854                 assert entry['type'] == 'reference'
2855                 results += self.extract(entry['url'])
2856             return results
2857         else: # Root page
2858             info = {
2859                 'id': 'Stanford OpenClassroom',
2860                 'type': 'playlist',
2861                 'uploader': None,
2862                 'upload_date': None,
2863             }
2864
2865             self.report_download_webpage(info['id'])
2866             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2867             try:
2868                 rootpage = compat_urllib_request.urlopen(rootURL).read()
2869             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2870                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2871
2872             info['title'] = info['id']
2873
2874             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2875             info['list'] = [
2876                 {
2877                     'type': 'reference',
2878                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2879                 }
2880                     for cpage in links]
2881
2882             results = []
2883             for entry in info['list']:
2884                 assert entry['type'] == 'reference'
2885                 results += self.extract(entry['url'])
2886             return results
2887
2888 class MTVIE(InfoExtractor):
2889     """Information extractor for MTV.com"""
2890
2891     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2892     IE_NAME = u'mtv'
2893
2894     def _real_extract(self, url):
2895         mobj = re.match(self._VALID_URL, url)
2896         if mobj is None:
2897             raise ExtractorError(u'Invalid URL: %s' % url)
2898         if not mobj.group('proto'):
2899             url = 'http://' + url
2900         video_id = mobj.group('videoid')
2901
2902         webpage = self._download_webpage(url, video_id)
2903
2904         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2905         if mobj is None:
2906             raise ExtractorError(u'Unable to extract song name')
2907         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2908         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2909         if mobj is None:
2910             raise ExtractorError(u'Unable to extract performer')
2911         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2912         video_title = performer + ' - ' + song_name
2913
2914         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2915         if mobj is None:
2916             raise ExtractorError(u'Unable to mtvn_uri')
2917         mtvn_uri = mobj.group(1)
2918
2919         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2920         if mobj is None:
2921             raise ExtractorError(u'Unable to extract content id')
2922         content_id = mobj.group(1)
2923
2924         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2925         self.report_extraction(video_id)
2926         request = compat_urllib_request.Request(videogen_url)
2927         try:
2928             metadataXml = compat_urllib_request.urlopen(request).read()
2929         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2930             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2931
2932         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2933         renditions = mdoc.findall('.//rendition')
2934
2935         # For now, always pick the highest quality.
2936         rendition = renditions[-1]
2937
2938         try:
2939             _,_,ext = rendition.attrib['type'].partition('/')
2940             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2941             video_url = rendition.find('./src').text
2942         except KeyError:
2943             raise ExtractorError('Invalid rendition field.')
2944
2945         info = {
2946             'id': video_id,
2947             'url': video_url,
2948             'uploader': performer,
2949             'upload_date': None,
2950             'title': video_title,
2951             'ext': ext,
2952             'format': format,
2953         }
2954
2955         return [info]
2956
2957
2958 class YoukuIE(InfoExtractor):
2959     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2960
2961     def _gen_sid(self):
2962         nowTime = int(time.time() * 1000)
2963         random1 = random.randint(1000,1998)
2964         random2 = random.randint(1000,9999)
2965
2966         return "%d%d%d" %(nowTime,random1,random2)
2967
2968     def _get_file_ID_mix_string(self, seed):
2969         mixed = []
2970         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2971         seed = float(seed)
2972         for i in range(len(source)):
2973             seed  =  (seed * 211 + 30031 ) % 65536
2974             index  =  math.floor(seed / 65536 * len(source) )
2975             mixed.append(source[int(index)])
2976             source.remove(source[int(index)])
2977         #return ''.join(mixed)
2978         return mixed
2979
2980     def _get_file_id(self, fileId, seed):
2981         mixed = self._get_file_ID_mix_string(seed)
2982         ids = fileId.split('*')
2983         realId = []
2984         for ch in ids:
2985             if ch:
2986                 realId.append(mixed[int(ch)])
2987         return ''.join(realId)
2988
2989     def _real_extract(self, url):
2990         mobj = re.match(self._VALID_URL, url)
2991         if mobj is None:
2992             raise ExtractorError(u'Invalid URL: %s' % url)
2993         video_id = mobj.group('ID')
2994
2995         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2996
2997         jsondata = self._download_webpage(info_url, video_id)
2998
2999         self.report_extraction(video_id)
3000         try:
3001             config = json.loads(jsondata)
3002
3003             video_title =  config['data'][0]['title']
3004             seed = config['data'][0]['seed']
3005
3006             format = self._downloader.params.get('format', None)
3007             supported_format = list(config['data'][0]['streamfileids'].keys())
3008
3009             if format is None or format == 'best':
3010                 if 'hd2' in supported_format:
3011                     format = 'hd2'
3012                 else:
3013                     format = 'flv'
3014                 ext = u'flv'
3015             elif format == 'worst':
3016                 format = 'mp4'
3017                 ext = u'mp4'
3018             else:
3019                 format = 'flv'
3020                 ext = u'flv'
3021
3022
3023             fileid = config['data'][0]['streamfileids'][format]
3024             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3025         except (UnicodeDecodeError, ValueError, KeyError):
3026             raise ExtractorError(u'Unable to extract info section')
3027
3028         files_info=[]
3029         sid = self._gen_sid()
3030         fileid = self._get_file_id(fileid, seed)
3031
3032         #column 8,9 of fileid represent the segment number
3033         #fileid[7:9] should be changed
3034         for index, key in enumerate(keys):
3035
3036             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3037             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3038
3039             info = {
3040                 'id': '%s_part%02d' % (video_id, index),
3041                 'url': download_url,
3042                 'uploader': None,
3043                 'upload_date': None,
3044                 'title': video_title,
3045                 'ext': ext,
3046             }
3047             files_info.append(info)
3048
3049         return files_info
3050
3051
3052 class XNXXIE(InfoExtractor):
3053     """Information extractor for xnxx.com"""
3054
3055     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3056     IE_NAME = u'xnxx'
3057     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3058     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3059     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3060
3061     def _real_extract(self, url):
3062         mobj = re.match(self._VALID_URL, url)
3063         if mobj is None:
3064             raise ExtractorError(u'Invalid URL: %s' % url)
3065         video_id = mobj.group(1)
3066
3067         # Get webpage content
3068         webpage = self._download_webpage(url, video_id)
3069
3070         result = re.search(self.VIDEO_URL_RE, webpage)
3071         if result is None:
3072             raise ExtractorError(u'Unable to extract video url')
3073         video_url = compat_urllib_parse.unquote(result.group(1))
3074
3075         result = re.search(self.VIDEO_TITLE_RE, webpage)
3076         if result is None:
3077             raise ExtractorError(u'Unable to extract video title')
3078         video_title = result.group(1)
3079
3080         result = re.search(self.VIDEO_THUMB_RE, webpage)
3081         if result is None:
3082             raise ExtractorError(u'Unable to extract video thumbnail')
3083         video_thumbnail = result.group(1)
3084
3085         return [{
3086             'id': video_id,
3087             'url': video_url,
3088             'uploader': None,
3089             'upload_date': None,
3090             'title': video_title,
3091             'ext': 'flv',
3092             'thumbnail': video_thumbnail,
3093             'description': None,
3094         }]
3095
3096
3097 class GooglePlusIE(InfoExtractor):
3098     """Information extractor for plus.google.com."""
3099
3100     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3101     IE_NAME = u'plus.google'
3102
3103     def report_extract_entry(self, url):
3104         """Report downloading extry"""
3105         self.to_screen(u'Downloading entry: %s' % url)
3106
3107     def report_date(self, upload_date):
3108         """Report downloading extry"""
3109         self.to_screen(u'Entry date: %s' % upload_date)
3110
3111     def report_uploader(self, uploader):
3112         """Report downloading extry"""
3113         self.to_screen(u'Uploader: %s' % uploader)
3114
3115     def report_title(self, video_title):
3116         """Report downloading extry"""
3117         self.to_screen(u'Title: %s' % video_title)
3118
3119     def report_extract_vid_page(self, video_page):
3120         """Report information extraction."""
3121         self.to_screen(u'Extracting video page: %s' % video_page)
3122
3123     def _real_extract(self, url):
3124         # Extract id from URL
3125         mobj = re.match(self._VALID_URL, url)
3126         if mobj is None:
3127             raise ExtractorError(u'Invalid URL: %s' % url)
3128
3129         post_url = mobj.group(0)
3130         video_id = mobj.group(1)
3131
3132         video_extension = 'flv'
3133
3134         # Step 1, Retrieve post webpage to extract further information
3135         self.report_extract_entry(post_url)
3136         webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3137
3138         # Extract update date
3139         upload_date = None
3140         pattern = 'title="Timestamp">(.*?)</a>'
3141         mobj = re.search(pattern, webpage)
3142         if mobj:
3143             upload_date = mobj.group(1)
3144             # Convert timestring to a format suitable for filename
3145             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3146             upload_date = upload_date.strftime('%Y%m%d')
3147         self.report_date(upload_date)
3148
3149         # Extract uploader
3150         uploader = None
3151         pattern = r'rel\="author".*?>(.*?)</a>'
3152         mobj = re.search(pattern, webpage)
3153         if mobj:
3154             uploader = mobj.group(1)
3155         self.report_uploader(uploader)
3156
3157         # Extract title
3158         # Get the first line for title
3159         video_title = u'NA'
3160         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3161         mobj = re.search(pattern, webpage)
3162         if mobj:
3163             video_title = mobj.group(1)
3164         self.report_title(video_title)
3165
3166         # Step 2, Stimulate clicking the image box to launch video
3167         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3168         mobj = re.search(pattern, webpage)
3169         if mobj is None:
3170             raise ExtractorError(u'Unable to extract video page URL')
3171
3172         video_page = mobj.group(1)
3173         webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3174         self.report_extract_vid_page(video_page)
3175
3176
3177         # Extract video links on video page
3178         """Extract video links of all sizes"""
3179         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3180         mobj = re.findall(pattern, webpage)
3181         if len(mobj) == 0:
3182             raise ExtractorError(u'Unable to extract video links')
3183
3184         # Sort in resolution
3185         links = sorted(mobj)
3186
3187         # Choose the lowest of the sort, i.e. highest resolution
3188         video_url = links[-1]
3189         # Only get the url. The resolution part in the tuple has no use anymore
3190         video_url = video_url[-1]
3191         # Treat escaped \u0026 style hex
3192         try:
3193             video_url = video_url.decode("unicode_escape")
3194         except AttributeError: # Python 3
3195             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3196
3197
3198         return [{
3199             'id':       video_id,
3200             'url':      video_url,
3201             'uploader': uploader,
3202             'upload_date':  upload_date,
3203             'title':    video_title,
3204             'ext':      video_extension,
3205         }]
3206
3207 class NBAIE(InfoExtractor):
3208     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3209     IE_NAME = u'nba'
3210
3211     def _real_extract(self, url):
3212         mobj = re.match(self._VALID_URL, url)
3213         if mobj is None:
3214             raise ExtractorError(u'Invalid URL: %s' % url)
3215
3216         video_id = mobj.group(1)
3217         if video_id.endswith('/index.html'):
3218             video_id = video_id[:-len('/index.html')]
3219
3220         webpage = self._download_webpage(url, video_id)
3221
3222         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3223         def _findProp(rexp, default=None):
3224             m = re.search(rexp, webpage)
3225             if m:
3226                 return unescapeHTML(m.group(1))
3227             else:
3228                 return default
3229
3230         shortened_video_id = video_id.rpartition('/')[2]
3231         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3232         info = {
3233             'id': shortened_video_id,
3234             'url': video_url,
3235             'ext': 'mp4',
3236             'title': title,
3237             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3238             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3239         }
3240         return [info]
3241
3242 class JustinTVIE(InfoExtractor):
3243     """Information extractor for justin.tv and twitch.tv"""
3244     # TODO: One broadcast may be split into multiple videos. The key
3245     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3246     # starts at 1 and increases. Can we treat all parts as one video?
3247
3248     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3249         (?:
3250             (?P<channelid>[^/]+)|
3251             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3252             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3253         )
3254         /?(?:\#.*)?$
3255         """
3256     _JUSTIN_PAGE_LIMIT = 100
3257     IE_NAME = u'justin.tv'
3258
3259     def report_download_page(self, channel, offset):
3260         """Report attempt to download a single page of videos."""
3261         self.to_screen(u'%s: Downloading video information from %d to %d' %
3262                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3263
3264     # Return count of items, list of *valid* items
3265     def _parse_page(self, url, video_id):
3266         webpage = self._download_webpage(url, video_id,
3267                                          u'Downloading video info JSON',
3268                                          u'unable to download video info JSON')
3269
3270         response = json.loads(webpage)
3271         if type(response) != list:
3272             error_text = response.get('error', 'unknown error')
3273             raise ExtractorError(u'Justin.tv API: %s' % error_text)
3274         info = []
3275         for clip in response:
3276             video_url = clip['video_file_url']
3277             if video_url:
3278                 video_extension = os.path.splitext(video_url)[1][1:]
3279                 video_date = re.sub('-', '', clip['start_time'][:10])
3280                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3281                 video_id = clip['id']
3282                 video_title = clip.get('title', video_id)
3283                 info.append({
3284                     'id': video_id,
3285                     'url': video_url,
3286                     'title': video_title,
3287                     'uploader': clip.get('channel_name', video_uploader_id),
3288                     'uploader_id': video_uploader_id,
3289                     'upload_date': video_date,
3290                     'ext': video_extension,
3291                 })
3292         return (len(response), info)
3293
3294     def _real_extract(self, url):
3295         mobj = re.match(self._VALID_URL, url)
3296         if mobj is None:
3297             raise ExtractorError(u'invalid URL: %s' % url)
3298
3299         api_base = 'http://api.justin.tv'
3300         paged = False
3301         if mobj.group('channelid'):
3302             paged = True
3303             video_id = mobj.group('channelid')
3304             api = api_base + '/channel/archives/%s.json' % video_id
3305         elif mobj.group('chapterid'):
3306             chapter_id = mobj.group('chapterid')
3307
3308             webpage = self._download_webpage(url, chapter_id)
3309             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3310             if not m:
3311                 raise ExtractorError(u'Cannot find archive of a chapter')
3312             archive_id = m.group(1)
3313
3314             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3315             chapter_info_xml = self._download_webpage(api, chapter_id,
3316                                              note=u'Downloading chapter information',
3317                                              errnote=u'Chapter information download failed')
3318             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3319             for a in doc.findall('.//archive'):
3320                 if archive_id == a.find('./id').text:
3321                     break
3322             else:
3323                 raise ExtractorError(u'Could not find chapter in chapter information')
3324
3325             video_url = a.find('./video_file_url').text
3326             video_ext = video_url.rpartition('.')[2] or u'flv'
3327
3328             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3329             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3330                                    note='Downloading chapter metadata',
3331                                    errnote='Download of chapter metadata failed')
3332             chapter_info = json.loads(chapter_info_json)
3333
3334             bracket_start = int(doc.find('.//bracket_start').text)
3335             bracket_end = int(doc.find('.//bracket_end').text)
3336
3337             # TODO determine start (and probably fix up file)
3338             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3339             #video_url += u'?start=' + TODO:start_timestamp
3340             # bracket_start is 13290, but we want 51670615
3341             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3342                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3343
3344             info = {
3345                 'id': u'c' + chapter_id,
3346                 'url': video_url,
3347                 'ext': video_ext,
3348                 'title': chapter_info['title'],
3349                 'thumbnail': chapter_info['preview'],
3350                 'description': chapter_info['description'],
3351                 'uploader': chapter_info['channel']['display_name'],
3352                 'uploader_id': chapter_info['channel']['name'],
3353             }
3354             return [info]
3355         else:
3356             video_id = mobj.group('videoid')
3357             api = api_base + '/broadcast/by_archive/%s.json' % video_id
3358
3359         self.report_extraction(video_id)
3360
3361         info = []
3362         offset = 0
3363         limit = self._JUSTIN_PAGE_LIMIT
3364         while True:
3365             if paged:
3366                 self.report_download_page(video_id, offset)
3367             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3368             page_count, page_info = self._parse_page(page_url, video_id)
3369             info.extend(page_info)
3370             if not paged or page_count != limit:
3371                 break
3372             offset += limit
3373         return info
3374
3375 class FunnyOrDieIE(InfoExtractor):
3376     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3377
3378     def _real_extract(self, url):
3379         mobj = re.match(self._VALID_URL, url)
3380         if mobj is None:
3381             raise ExtractorError(u'invalid URL: %s' % url)
3382
3383         video_id = mobj.group('id')
3384         webpage = self._download_webpage(url, video_id)
3385
3386         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3387         if not m:
3388             raise ExtractorError(u'Unable to find video information')
3389         video_url = unescapeHTML(m.group('url'))
3390
3391         m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3392         if not m:
3393             m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3394             if not m:
3395                 raise ExtractorError(u'Cannot find video title')
3396         title = clean_html(m.group('title'))
3397
3398         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3399         if m:
3400             desc = unescapeHTML(m.group('desc'))
3401         else:
3402             desc = None
3403
3404         info = {
3405             'id': video_id,
3406             'url': video_url,
3407             'ext': 'mp4',
3408             'title': title,
3409             'description': desc,
3410         }
3411         return [info]
3412
3413 class SteamIE(InfoExtractor):
3414     _VALID_URL = r"""http://store\.steampowered\.com/
3415                 (agecheck/)?
3416                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3417                 (?P<gameID>\d+)/?
3418                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3419                 """
3420
3421     @classmethod
3422     def suitable(cls, url):
3423         """Receives a URL and returns True if suitable for this IE."""
3424         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3425
3426     def _real_extract(self, url):
3427         m = re.match(self._VALID_URL, url, re.VERBOSE)
3428         gameID = m.group('gameID')
3429         videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3430         self.report_age_confirmation()
3431         webpage = self._download_webpage(videourl, gameID)
3432         game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3433
3434         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3435         mweb = re.finditer(urlRE, webpage)
3436         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3437         titles = re.finditer(namesRE, webpage)
3438         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3439         thumbs = re.finditer(thumbsRE, webpage)
3440         videos = []
3441         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3442             video_id = vid.group('videoID')
3443             title = vtitle.group('videoName')
3444             video_url = vid.group('videoURL')
3445             video_thumb = thumb.group('thumbnail')
3446             if not video_url:
3447                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3448             info = {
3449                 'id':video_id,
3450                 'url':video_url,
3451                 'ext': 'flv',
3452                 'title': unescapeHTML(title),
3453                 'thumbnail': video_thumb
3454                   }
3455             videos.append(info)
3456         return [self.playlist_result(videos, gameID, game_title)]
3457
3458 class UstreamIE(InfoExtractor):
3459     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3460     IE_NAME = u'ustream'
3461
3462     def _real_extract(self, url):
3463         m = re.match(self._VALID_URL, url)
3464         video_id = m.group('videoID')
3465         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3466         webpage = self._download_webpage(url, video_id)
3467         self.report_extraction(video_id)
3468         try:
3469             m = re.search(r'data-title="(?P<title>.+)"',webpage)
3470             title = m.group('title')
3471             m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3472                           webpage, re.DOTALL)
3473             uploader = unescapeHTML(m.group('uploader').strip())
3474             m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage)
3475             thumb = m.group('thumb')
3476         except AttributeError:
3477             raise ExtractorError(u'Unable to extract info')
3478         info = {
3479                 'id':video_id,
3480                 'url':video_url,
3481                 'ext': 'flv',
3482                 'title': title,
3483                 'uploader': uploader,
3484                 'thumbnail': thumb,
3485                   }
3486         return info
3487
3488 class WorldStarHipHopIE(InfoExtractor):
3489     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3490     IE_NAME = u'WorldStarHipHop'
3491
3492     def _real_extract(self, url):
3493         _src_url = r'so\.addVariable\("file","(.*?)"\)'
3494
3495         m = re.match(self._VALID_URL, url)
3496         video_id = m.group('id')
3497
3498         webpage_src = self._download_webpage(url, video_id)
3499
3500         mobj = re.search(_src_url, webpage_src)
3501
3502         if mobj is not None:
3503             video_url = mobj.group(1)
3504             if 'mp4' in video_url:
3505                 ext = 'mp4'
3506             else:
3507                 ext = 'flv'
3508         else:
3509             raise ExtractorError(u'Cannot find video url for %s' % video_id)
3510
3511         mobj = re.search(r"<title>(.*)</title>", webpage_src)
3512
3513         if mobj is None:
3514             raise ExtractorError(u'Cannot determine title')
3515         title = mobj.group(1)
3516
3517         mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3518         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3519         if mobj is not None:
3520             thumbnail = mobj.group(1)
3521         else:
3522             _title = r"""candytitles.*>(.*)</span>"""
3523             mobj = re.search(_title, webpage_src)
3524             if mobj is not None:
3525                 title = mobj.group(1)
3526             thumbnail = None
3527
3528         results = [{
3529                     'id': video_id,
3530                     'url' : video_url,
3531                     'title' : title,
3532                     'thumbnail' : thumbnail,
3533                     'ext' : ext,
3534                     }]
3535         return results
3536
3537 class RBMARadioIE(InfoExtractor):
3538     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3539
3540     def _real_extract(self, url):
3541         m = re.match(self._VALID_URL, url)
3542         video_id = m.group('videoID')
3543
3544         webpage = self._download_webpage(url, video_id)
3545         m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3546         if not m:
3547             raise ExtractorError(u'Cannot find metadata')
3548         json_data = m.group(1)
3549
3550         try:
3551             data = json.loads(json_data)
3552         except ValueError as e:
3553             raise ExtractorError(u'Invalid JSON: ' + str(e))
3554
3555         video_url = data['akamai_url'] + '&cbr=256'
3556         url_parts = compat_urllib_parse_urlparse(video_url)
3557         video_ext = url_parts.path.rpartition('.')[2]
3558         info = {
3559                 'id': video_id,
3560                 'url': video_url,
3561                 'ext': video_ext,
3562                 'title': data['title'],
3563                 'description': data.get('teaser_text'),
3564                 'location': data.get('country_of_origin'),
3565                 'uploader': data.get('host', {}).get('name'),
3566                 'uploader_id': data.get('host', {}).get('slug'),
3567                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3568                 'duration': data.get('duration'),
3569         }
3570         return [info]
3571
3572
3573 class YouPornIE(InfoExtractor):
3574     """Information extractor for youporn.com."""
3575     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3576
3577     def _print_formats(self, formats):
3578         """Print all available formats"""
3579         print(u'Available formats:')
3580         print(u'ext\t\tformat')
3581         print(u'---------------------------------')
3582         for format in formats:
3583             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3584
3585     def _specific(self, req_format, formats):
3586         for x in formats:
3587             if(x["format"]==req_format):
3588                 return x
3589         return None
3590
3591     def _real_extract(self, url):
3592         mobj = re.match(self._VALID_URL, url)
3593         if mobj is None:
3594             raise ExtractorError(u'Invalid URL: %s' % url)
3595
3596         video_id = mobj.group('videoid')
3597
3598         req = compat_urllib_request.Request(url)
3599         req.add_header('Cookie', 'age_verified=1')
3600         webpage = self._download_webpage(req, video_id)
3601
3602         # Get the video title
3603         result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3604         if result is None:
3605             raise ExtractorError(u'Unable to extract video title')
3606         video_title = result.group('title').strip()
3607
3608         # Get the video date
3609         result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3610         if result is None:
3611             self._downloader.report_warning(u'unable to extract video date')
3612             upload_date = None
3613         else:
3614             upload_date = unified_strdate(result.group('date').strip())
3615
3616         # Get the video uploader
3617         result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3618         if result is None:
3619             self._downloader.report_warning(u'unable to extract uploader')
3620             video_uploader = None
3621         else:
3622             video_uploader = result.group('uploader').strip()
3623             video_uploader = clean_html( video_uploader )
3624
3625         # Get all of the formats available
3626         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3627         result = re.search(DOWNLOAD_LIST_RE, webpage)
3628         if result is None:
3629             raise ExtractorError(u'Unable to extract download list')
3630         download_list_html = result.group('download_list').strip()
3631
3632         # Get all of the links from the page
3633         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3634         links = re.findall(LINK_RE, download_list_html)
3635         if(len(links) == 0):
3636             raise ExtractorError(u'ERROR: no known formats available for video')
3637
3638         self.to_screen(u'Links found: %d' % len(links))
3639
3640         formats = []
3641         for link in links:
3642
3643             # A link looks like this:
3644             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3645             # A path looks like this:
3646             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3647             video_url = unescapeHTML( link )
3648             path = compat_urllib_parse_urlparse( video_url ).path
3649             extension = os.path.splitext( path )[1][1:]
3650             format = path.split('/')[4].split('_')[:2]
3651             size = format[0]
3652             bitrate = format[1]
3653             format = "-".join( format )
3654             title = u'%s-%s-%s' % (video_title, size, bitrate)
3655
3656             formats.append({
3657                 'id': video_id,
3658                 'url': video_url,
3659                 'uploader': video_uploader,
3660                 'upload_date': upload_date,
3661                 'title': title,
3662                 'ext': extension,
3663                 'format': format,
3664                 'thumbnail': None,
3665                 'description': None,
3666                 'player_url': None
3667             })
3668
3669         if self._downloader.params.get('listformats', None):
3670             self._print_formats(formats)
3671             return
3672
3673         req_format = self._downloader.params.get('format', None)
3674         self.to_screen(u'Format: %s' % req_format)
3675
3676         if req_format is None or req_format == 'best':
3677             return [formats[0]]
3678         elif req_format == 'worst':
3679             return [formats[-1]]
3680         elif req_format in ('-1', 'all'):
3681             return formats
3682         else:
3683             format = self._specific( req_format, formats )
3684             if result is None:
3685                 raise ExtractorError(u'Requested format not available')
3686             return [format]
3687
3688
3689
3690 class PornotubeIE(InfoExtractor):
3691     """Information extractor for pornotube.com."""
3692     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3693
3694     def _real_extract(self, url):
3695         mobj = re.match(self._VALID_URL, url)
3696         if mobj is None:
3697             raise ExtractorError(u'Invalid URL: %s' % url)
3698
3699         video_id = mobj.group('videoid')
3700         video_title = mobj.group('title')
3701
3702         # Get webpage content
3703         webpage = self._download_webpage(url, video_id)
3704
3705         # Get the video URL
3706         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3707         result = re.search(VIDEO_URL_RE, webpage)
3708         if result is None:
3709             raise ExtractorError(u'Unable to extract video url')
3710         video_url = compat_urllib_parse.unquote(result.group('url'))
3711
3712         #Get the uploaded date
3713         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3714         result = re.search(VIDEO_UPLOADED_RE, webpage)
3715         if result is None:
3716             raise ExtractorError(u'Unable to extract video title')
3717         upload_date = unified_strdate(result.group('date'))
3718
3719         info = {'id': video_id,
3720                 'url': video_url,
3721                 'uploader': None,
3722                 'upload_date': upload_date,
3723                 'title': video_title,
3724                 'ext': 'flv',
3725                 'format': 'flv'}
3726
3727         return [info]
3728
3729 class YouJizzIE(InfoExtractor):
3730     """Information extractor for youjizz.com."""
3731     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3732
3733     def _real_extract(self, url):
3734         mobj = re.match(self._VALID_URL, url)
3735         if mobj is None:
3736             raise ExtractorError(u'Invalid URL: %s' % url)
3737
3738         video_id = mobj.group('videoid')
3739
3740         # Get webpage content
3741         webpage = self._download_webpage(url, video_id)
3742
3743         # Get the video title
3744         result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3745         if result is None:
3746             raise ExtractorError(u'ERROR: unable to extract video title')
3747         video_title = result.group('title').strip()
3748
3749         # Get the embed page
3750         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3751         if result is None:
3752             raise ExtractorError(u'ERROR: unable to extract embed page')
3753
3754         embed_page_url = result.group(0).strip()
3755         video_id = result.group('videoid')
3756
3757         webpage = self._download_webpage(embed_page_url, video_id)
3758
3759         # Get the video URL
3760         result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3761         if result is None:
3762             raise ExtractorError(u'ERROR: unable to extract video url')
3763         video_url = result.group('source')
3764
3765         info = {'id': video_id,
3766                 'url': video_url,
3767                 'title': video_title,
3768                 'ext': 'flv',
3769                 'format': 'flv',
3770                 'player_url': embed_page_url}
3771
3772         return [info]
3773
3774 class EightTracksIE(InfoExtractor):
3775     IE_NAME = '8tracks'
3776     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3777
3778     def _real_extract(self, url):
3779         mobj = re.match(self._VALID_URL, url)
3780         if mobj is None:
3781             raise ExtractorError(u'Invalid URL: %s' % url)
3782         playlist_id = mobj.group('id')
3783
3784         webpage = self._download_webpage(url, playlist_id)
3785
3786         m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3787         if not m:
3788             raise ExtractorError(u'Cannot find trax information')
3789         json_like = m.group(1)
3790         data = json.loads(json_like)
3791
3792         session = str(random.randint(0, 1000000000))
3793         mix_id = data['id']
3794         track_count = data['tracks_count']
3795         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3796         next_url = first_url
3797         res = []
3798         for i in itertools.count():
3799             api_json = self._download_webpage(next_url, playlist_id,
3800                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3801                 errnote=u'Failed to download song information')
3802             api_data = json.loads(api_json)
3803             track_data = api_data[u'set']['track']
3804             info = {
3805                 'id': track_data['id'],
3806                 'url': track_data['track_file_stream_url'],
3807                 'title': track_data['performer'] + u' - ' + track_data['name'],
3808                 'raw_title': track_data['name'],
3809                 'uploader_id': data['user']['login'],
3810                 'ext': 'm4a',
3811             }
3812             res.append(info)
3813             if api_data['set']['at_last_track']:
3814                 break
3815             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3816         return res
3817
3818 class KeekIE(InfoExtractor):
3819     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3820     IE_NAME = u'keek'
3821
3822     def _real_extract(self, url):
3823         m = re.match(self._VALID_URL, url)
3824         video_id = m.group('videoID')
3825         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3826         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3827         webpage = self._download_webpage(url, video_id)
3828         m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3829         title = unescapeHTML(m.group('title'))
3830         m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3831         uploader = clean_html(m.group('uploader'))
3832         info = {
3833                 'id': video_id,
3834                 'url': video_url,
3835                 'ext': 'mp4',
3836                 'title': title,
3837                 'thumbnail': thumbnail,
3838                 'uploader': uploader
3839         }
3840         return [info]
3841
3842 class TEDIE(InfoExtractor):
3843     _VALID_URL=r'''http://www\.ted\.com/
3844                    (
3845                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3846                         |
3847                         ((?P<type_talk>talks)) # We have a simple talk
3848                    )
3849                    (/lang/(.*?))? # The url may contain the language
3850                    /(?P<name>\w+) # Here goes the name and then ".html"
3851                    '''
3852
3853     @classmethod
3854     def suitable(cls, url):
3855         """Receives a URL and returns True if suitable for this IE."""
3856         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3857
3858     def _real_extract(self, url):
3859         m=re.match(self._VALID_URL, url, re.VERBOSE)
3860         if m.group('type_talk'):
3861             return [self._talk_info(url)]
3862         else :
3863             playlist_id=m.group('playlist_id')
3864             name=m.group('name')
3865             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3866             return [self._playlist_videos_info(url,name,playlist_id)]
3867
3868     def _talk_video_link(self,mediaSlug):
3869         '''Returns the video link for that mediaSlug'''
3870         return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3871
3872     def _playlist_videos_info(self,url,name,playlist_id=0):
3873         '''Returns the videos of the playlist'''
3874         video_RE=r'''
3875                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3876                      ([.\s]*?)data-playlist_item_id="(\d+)"
3877                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3878                      '''
3879         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3880         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3881         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3882         m_names=re.finditer(video_name_RE,webpage)
3883
3884         playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3885         m_playlist = re.search(playlist_RE, webpage)
3886         playlist_title = m_playlist.group('playlist_title')
3887
3888         playlist_entries = []
3889         for m_video, m_name in zip(m_videos,m_names):
3890             video_id=m_video.group('video_id')
3891             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3892             playlist_entries.append(self.url_result(talk_url, 'TED'))
3893         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3894
3895     def _talk_info(self, url, video_id=0):
3896         """Return the video for the talk in the url"""
3897         m=re.match(self._VALID_URL, url,re.VERBOSE)
3898         videoName=m.group('name')
3899         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3900         # If the url includes the language we get the title translated
3901         title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3902         title=re.search(title_RE, webpage).group('title')
3903         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3904                         "id":(?P<videoID>[\d]+).*?
3905                         "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3906         thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3907         thumb_match=re.search(thumb_RE,webpage)
3908         info_match=re.search(info_RE,webpage,re.VERBOSE)
3909         video_id=info_match.group('videoID')
3910         mediaSlug=info_match.group('mediaSlug')
3911         video_url=self._talk_video_link(mediaSlug)
3912         info = {
3913                 'id': video_id,
3914                 'url': video_url,
3915                 'ext': 'mp4',
3916                 'title': title,
3917                 'thumbnail': thumb_match.group('thumbnail')
3918                 }
3919         return info
3920
3921 class MySpassIE(InfoExtractor):
3922     _VALID_URL = r'http://www.myspass.de/.*'
3923
3924     def _real_extract(self, url):
3925         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3926
3927         # video id is the last path element of the URL
3928         # usually there is a trailing slash, so also try the second but last
3929         url_path = compat_urllib_parse_urlparse(url).path
3930         url_parent_path, video_id = os.path.split(url_path)
3931         if not video_id:
3932             _, video_id = os.path.split(url_parent_path)
3933
3934         # get metadata
3935         metadata_url = META_DATA_URL_TEMPLATE % video_id
3936         metadata_text = self._download_webpage(metadata_url, video_id)
3937         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3938
3939         # extract values from metadata
3940         url_flv_el = metadata.find('url_flv')
3941         if url_flv_el is None:
3942             raise ExtractorError(u'Unable to extract download url')
3943         video_url = url_flv_el.text
3944         extension = os.path.splitext(video_url)[1][1:]
3945         title_el = metadata.find('title')
3946         if title_el is None:
3947             raise ExtractorError(u'Unable to extract title')
3948         title = title_el.text
3949         format_id_el = metadata.find('format_id')
3950         if format_id_el is None:
3951             format = ext
3952         else:
3953             format = format_id_el.text
3954         description_el = metadata.find('description')
3955         if description_el is not None:
3956             description = description_el.text
3957         else:
3958             description = None
3959         imagePreview_el = metadata.find('imagePreview')
3960         if imagePreview_el is not None:
3961             thumbnail = imagePreview_el.text
3962         else:
3963             thumbnail = None
3964         info = {
3965             'id': video_id,
3966             'url': video_url,
3967             'title': title,
3968             'ext': extension,
3969             'format': format,
3970             'thumbnail': thumbnail,
3971             'description': description
3972         }
3973         return [info]
3974
3975 class SpiegelIE(InfoExtractor):
3976     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3977
3978     def _real_extract(self, url):
3979         m = re.match(self._VALID_URL, url)
3980         video_id = m.group('videoID')
3981
3982         webpage = self._download_webpage(url, video_id)
3983         m = re.search(r'<div class="module-title">(.*?)</div>', webpage)
3984         if not m:
3985             raise ExtractorError(u'Cannot find title')
3986         video_title = unescapeHTML(m.group(1))
3987
3988         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3989         xml_code = self._download_webpage(xml_url, video_id,
3990                     note=u'Downloading XML', errnote=u'Failed to download XML')
3991
3992         idoc = xml.etree.ElementTree.fromstring(xml_code)
3993         last_type = idoc[-1]
3994         filename = last_type.findall('./filename')[0].text
3995         duration = float(last_type.findall('./duration')[0].text)
3996
3997         video_url = 'http://video2.spiegel.de/flash/' + filename
3998         video_ext = filename.rpartition('.')[2]
3999         info = {
4000             'id': video_id,
4001             'url': video_url,
4002             'ext': video_ext,
4003             'title': video_title,
4004             'duration': duration,
4005         }
4006         return [info]
4007
4008 class LiveLeakIE(InfoExtractor):
4009
4010     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4011     IE_NAME = u'liveleak'
4012
4013     def _real_extract(self, url):
4014         mobj = re.match(self._VALID_URL, url)
4015         if mobj is None:
4016             raise ExtractorError(u'Invalid URL: %s' % url)
4017
4018         video_id = mobj.group('video_id')
4019
4020         webpage = self._download_webpage(url, video_id)
4021
4022         m = re.search(r'file: "(.*?)",', webpage)
4023         if not m:
4024             raise ExtractorError(u'Unable to find video url')
4025         video_url = m.group(1)
4026
4027         m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4028         if not m:
4029             raise ExtractorError(u'Cannot find video title')
4030         title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4031
4032         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4033         if m:
4034             desc = unescapeHTML(m.group('desc'))
4035         else:
4036             desc = None
4037
4038         m = re.search(r'By:.*?(\w+)</a>', webpage)
4039         if m:
4040             uploader = clean_html(m.group(1))
4041         else:
4042             uploader = None
4043
4044         info = {
4045             'id':  video_id,
4046             'url': video_url,
4047             'ext': 'mp4',
4048             'title': title,
4049             'description': desc,
4050             'uploader': uploader
4051         }
4052
4053         return [info]
4054
4055 class ARDIE(InfoExtractor):
4056     _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4057     _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4058     _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4059
4060     def _real_extract(self, url):
4061         # determine video id from url
4062         m = re.match(self._VALID_URL, url)
4063
4064         numid = re.search(r'documentId=([0-9]+)', url)
4065         if numid:
4066             video_id = numid.group(1)
4067         else:
4068             video_id = m.group('video_id')
4069
4070         # determine title and media streams from webpage
4071         html = self._download_webpage(url, video_id)
4072         title = re.search(self._TITLE, html).group('title')
4073         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4074         if not streams:
4075             assert '"fsk"' in html
4076             raise ExtractorError(u'This video is only available after 8:00 pm')
4077
4078         # choose default media type and highest quality for now
4079         stream = max([s for s in streams if int(s["media_type"]) == 0],
4080                      key=lambda s: int(s["quality"]))
4081
4082         # there's two possibilities: RTMP stream or HTTP download
4083         info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4084         if stream['rtmp_url']:
4085             self.to_screen(u'RTMP download detected')
4086             assert stream['video_url'].startswith('mp4:')
4087             info["url"] = stream["rtmp_url"]
4088             info["play_path"] = stream['video_url']
4089         else:
4090             assert stream["video_url"].endswith('.mp4')
4091             info["url"] = stream["video_url"]
4092         return [info]
4093
4094 class ZDFIE(InfoExtractor):
4095     _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4096     _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
4097     _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
4098     _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
4099     _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
4100
4101     def _real_extract(self, url):
4102         mobj = re.match(self._VALID_URL, url)
4103         if mobj is None:
4104             raise ExtractorError(u'Invalid URL: %s' % url)
4105         video_id = mobj.group('video_id')
4106
4107         html = self._download_webpage(url, video_id)
4108         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4109         if streams is None:
4110             raise ExtractorError(u'No media url found.')
4111
4112         # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
4113         # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
4114         # choose first/default media type and highest quality for now
4115         for s in streams:        #find 300 - dsl1000mbit
4116             if s['quality'] == '300' and s['media_type'] == 'wstreaming':
4117                 stream_=s
4118                 break
4119         for s in streams:        #find veryhigh - dsl2000mbit
4120             if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
4121                 stream_=s
4122                 break
4123         if stream_ is None:
4124             raise ExtractorError(u'No stream found.')
4125
4126         media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
4127
4128         self.report_extraction(video_id)
4129         mobj = re.search(self._TITLE, html)
4130         if mobj is None:
4131             raise ExtractorError(u'Cannot extract title')
4132         title = unescapeHTML(mobj.group('title'))
4133
4134         mobj = re.search(self._MMS_STREAM, media_link)
4135         if mobj is None:
4136             mobj = re.search(self._RTSP_STREAM, media_link)
4137             if mobj is None:
4138                 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
4139         mms_url = mobj.group('video_url')
4140
4141         mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
4142         if mobj is None:
4143             raise ExtractorError(u'Cannot extract extention')
4144         ext = mobj.group('ext')
4145
4146         return [{'id': video_id,
4147                  'url': mms_url,
4148                  'title': title,
4149                  'ext': ext
4150                  }]
4151
4152 class TumblrIE(InfoExtractor):
4153     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4154
4155     def _real_extract(self, url):
4156         m_url = re.match(self._VALID_URL, url)
4157         video_id = m_url.group('id')
4158         blog = m_url.group('blog_name')
4159
4160         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4161         webpage = self._download_webpage(url, video_id)
4162
4163         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4164         video = re.search(re_video, webpage)
4165         if video is None:
4166             self.to_screen("No video found")
4167             return []
4168         video_url = video.group('video_url')
4169         ext = video.group('ext')
4170
4171         re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
4172         thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4173
4174         # The only place where you can get a title, it's not complete,
4175         # but searching in other places doesn't work for all videos
4176         re_title = r'<title>(?P<title>.*?)</title>'
4177         title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
4178
4179         return [{'id': video_id,
4180                  'url': video_url,
4181                  'title': title,
4182                  'thumbnail': thumb,
4183                  'ext': ext
4184                  }]
4185
4186 class BandcampIE(InfoExtractor):
4187     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4188
4189     def _real_extract(self, url):
4190         mobj = re.match(self._VALID_URL, url)
4191         title = mobj.group('title')
4192         webpage = self._download_webpage(url, title)
4193         # We get the link to the free download page
4194         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4195         if m_download is None:
4196             raise ExtractorError(u'No free songs founded')
4197
4198         download_link = m_download.group(1)
4199         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4200                        webpage, re.MULTILINE|re.DOTALL).group('id')
4201
4202         download_webpage = self._download_webpage(download_link, id,
4203                                                   'Downloading free downloads page')
4204         # We get the dictionary of the track from some javascrip code
4205         info = re.search(r'items: (.*?),$',
4206                          download_webpage, re.MULTILINE).group(1)
4207         info = json.loads(info)[0]
4208         # We pick mp3-320 for now, until format selection can be easily implemented.
4209         mp3_info = info[u'downloads'][u'mp3-320']
4210         # If we try to use this url it says the link has expired
4211         initial_url = mp3_info[u'url']
4212         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4213         m_url = re.match(re_url, initial_url)
4214         #We build the url we will use to get the final track url
4215         # This url is build in Bandcamp in the script download_bunde_*.js
4216         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4217         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4218         # If we could correctly generate the .rand field the url would be
4219         #in the "download_url" key
4220         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4221
4222         track_info = {'id':id,
4223                       'title' : info[u'title'],
4224                       'ext' : 'mp3',
4225                       'url' : final_url,
4226                       'thumbnail' : info[u'thumb_url'],
4227                       'uploader' : info[u'artist']
4228                       }
4229
4230         return [track_info]
4231
4232 class RedTubeIE(InfoExtractor):
4233     """Information Extractor for redtube"""
4234     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4235
4236     def _real_extract(self,url):
4237         mobj = re.match(self._VALID_URL, url)
4238         if mobj is None:
4239             raise ExtractorError(u'Invalid URL: %s' % url)
4240
4241         video_id = mobj.group('id')
4242         video_extension = 'mp4'
4243         webpage = self._download_webpage(url, video_id)
4244         self.report_extraction(video_id)
4245         mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
4246
4247         if mobj is None:
4248             raise ExtractorError(u'Unable to extract media URL')
4249
4250         video_url = mobj.group(1)
4251         mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
4252         if mobj is None:
4253             raise ExtractorError(u'Unable to extract title')
4254         video_title = mobj.group(1)
4255
4256         return [{
4257             'id':       video_id,
4258             'url':      video_url,
4259             'ext':      video_extension,
4260             'title':    video_title,
4261         }]
4262
4263 class InaIE(InfoExtractor):
4264     """Information Extractor for Ina.fr"""
4265     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4266
4267     def _real_extract(self,url):
4268         mobj = re.match(self._VALID_URL, url)
4269
4270         video_id = mobj.group('id')
4271         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4272         video_extension = 'mp4'
4273         webpage = self._download_webpage(mrss_url, video_id)
4274
4275         mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
4276         if mobj is None:
4277             raise ExtractorError(u'Unable to extract media URL')
4278         video_url = mobj.group(1)
4279
4280         mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
4281         if mobj is None:
4282             raise ExtractorError(u'Unable to extract title')
4283         video_title = mobj.group(1)
4284
4285         return [{
4286             'id':       video_id,
4287             'url':      video_url,
4288             'ext':      video_extension,
4289             'title':    video_title,
4290         }]
4291
4292 class HowcastIE(InfoExtractor):
4293     """Information Extractor for Howcast.com"""
4294     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4295
4296     def _real_extract(self, url):
4297         mobj = re.match(self._VALID_URL, url)
4298
4299         video_id = mobj.group('id')
4300         webpage_url = 'http://www.howcast.com/videos/' + video_id
4301         webpage = self._download_webpage(webpage_url, video_id)
4302
4303         self.report_extraction(video_id)
4304
4305         mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
4306         if mobj is None:
4307             raise ExtractorError(u'Unable to extract video URL')
4308         video_url = mobj.group(1)
4309
4310         mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
4311         if mobj is None:
4312             raise ExtractorError(u'Unable to extract title')
4313         video_title = mobj.group(1) or mobj.group(2)
4314
4315         mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
4316         if mobj is None:
4317             self._downloader.report_warning(u'unable to extract description')
4318             video_description = None
4319         else:
4320             video_description = mobj.group(1) or mobj.group(2)
4321
4322         mobj = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
4323         if mobj is None:
4324             raise ExtractorError(u'Unable to extract thumbnail')
4325         thumbnail = mobj.group(1)
4326
4327         return [{
4328             'id':       video_id,
4329             'url':      video_url,
4330             'ext':      'mp4',
4331             'title':    video_title,
4332             'description': video_description,
4333             'thumbnail': thumbnail,
4334         }]
4335
4336 class VineIE(InfoExtractor):
4337     """Information Extractor for Vine.co"""
4338     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4339
4340     def _real_extract(self, url):
4341
4342         mobj = re.match(self._VALID_URL, url)
4343
4344         video_id = mobj.group('id')
4345         webpage_url = 'https://vine.co/v/' + video_id
4346         webpage = self._download_webpage(webpage_url, video_id)
4347
4348         self.report_extraction(video_id)
4349
4350         mobj = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
4351         if mobj is None:
4352             raise ExtractorError(u'Unable to extract video URL')
4353         video_url = mobj.group(1)
4354
4355         mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
4356         if mobj is None:
4357             raise ExtractorError(u'Unable to extract title')
4358         video_title = mobj.group(1)
4359
4360         mobj = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
4361         if mobj is None:
4362             raise ExtractorError(u'Unable to extract thumbnail')
4363         thumbnail = mobj.group(1)
4364
4365         mobj = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
4366         if mobj is None:
4367             raise ExtractorError(u'Unable to extract uploader')
4368         uploader = mobj.group(1)
4369
4370         return [{
4371             'id':        video_id,
4372             'url':       video_url,
4373             'ext':       'mp4',
4374             'title':     video_title,
4375             'thumbnail': thumbnail,
4376             'uploader':  uploader,
4377         }]
4378
4379 class FlickrIE(InfoExtractor):
4380     """Information Extractor for Flickr videos"""
4381     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4382
4383     def _real_extract(self, url):
4384         mobj = re.match(self._VALID_URL, url)
4385
4386         video_id = mobj.group('id')
4387         video_uploader_id = mobj.group('uploader_id')
4388         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4389         webpage = self._download_webpage(webpage_url, video_id)
4390
4391         mobj = re.search(r"photo_secret: '(\w+)'", webpage)
4392         if mobj is None:
4393             raise ExtractorError(u'Unable to extract video secret')
4394         secret = mobj.group(1)
4395
4396         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4397         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
4398
4399         mobj = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
4400         if mobj is None:
4401             raise ExtractorError(u'Unable to extract node_id')
4402         node_id = mobj.group(1)
4403
4404         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4405         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4406
4407         self.report_extraction(video_id)
4408
4409         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
4410         if mobj is None:
4411             raise ExtractorError(u'Unable to extract video url')
4412         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
4413
4414         mobj = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
4415         if mobj is None:
4416             raise ExtractorError(u'Unable to extract title')
4417         video_title = mobj.group(1) or mobj.group(2)
4418
4419         mobj = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
4420         if mobj is None:
4421             self._downloader.report_warning(u'unable to extract description')
4422             video_description = None
4423         else:
4424             video_description = mobj.group(1) or mobj.group(2)
4425
4426         mobj = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
4427         if mobj is None:
4428             raise ExtractorError(u'Unable to extract thumbnail')
4429         thumbnail = mobj.group(1) or mobj.group(2)
4430
4431         return [{
4432             'id':          video_id,
4433             'url':         video_url,
4434             'ext':         'mp4',
4435             'title':       video_title,
4436             'description': video_description,
4437             'thumbnail':   thumbnail,
4438             'uploader_id': video_uploader_id,
4439         }]
4440
4441 class TeamcocoIE(InfoExtractor):
4442     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4443
4444     def _real_extract(self, url):
4445         mobj = re.match(self._VALID_URL, url)
4446         if mobj is None:
4447             raise ExtractorError(u'Invalid URL: %s' % url)
4448         url_title = mobj.group('url_title')
4449         webpage = self._download_webpage(url, url_title)
4450
4451         mobj = re.search(r'<article class="video" data-id="(\d+?)"', webpage)
4452         video_id = mobj.group(1)
4453
4454         self.report_extraction(video_id)
4455
4456         mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
4457         if mobj is None:
4458             raise ExtractorError(u'Unable to extract title')
4459         video_title = mobj.group(1)
4460
4461         mobj = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
4462         if mobj is None:
4463             raise ExtractorError(u'Unable to extract thumbnail')
4464         thumbnail = mobj.group(1)
4465
4466         mobj = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
4467         if mobj is None:
4468             raise ExtractorError(u'Unable to extract description')
4469         description = mobj.group(1)
4470
4471         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4472         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
4473         mobj = re.search(r'<file type="high".*?>(.*?)</file>', data)
4474         if mobj is None:
4475             raise ExtractorError(u'Unable to extract video url')
4476         video_url = mobj.group(1)
4477
4478         return [{
4479             'id':          video_id,
4480             'url':         video_url,
4481             'ext':         'mp4',
4482             'title':       video_title,
4483             'thumbnail':   thumbnail,
4484             'description': description,
4485         }]
4486
4487 class XHamsterIE(InfoExtractor):
4488     """Information Extractor for xHamster"""
4489     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4490
4491     def _real_extract(self,url):
4492         mobj = re.match(self._VALID_URL, url)
4493
4494         video_id = mobj.group('id')
4495         mrss_url='http://xhamster.com/movies/%s/.html' % video_id
4496         webpage = self._download_webpage(mrss_url, video_id)
4497         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4498         if mobj is None:
4499             raise ExtractorError(u'Unable to extract media URL')
4500         if len(mobj.group('server')) == 0:
4501             video_url = compat_urllib_parse.unquote(mobj.group('file'))
4502         else:
4503             video_url = mobj.group('server')+'/key='+mobj.group('file')
4504         video_extension = video_url.split('.')[-1]
4505
4506         mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
4507         if mobj is None:
4508             raise ExtractorError(u'Unable to extract title')
4509         video_title = unescapeHTML(mobj.group('title'))
4510
4511         mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
4512         if mobj is None:
4513             video_description = u''
4514         else:
4515             video_description = unescapeHTML(mobj.group('description'))
4516
4517         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4518         if mobj is None:
4519             raise ExtractorError(u'Unable to extract upload date')
4520         video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4521
4522         mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
4523         if mobj is None:
4524             video_uploader_id = u'anonymous'
4525         else:
4526             video_uploader_id = mobj.group('uploader_id')
4527
4528         mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
4529         if mobj is None:
4530             raise ExtractorError(u'Unable to extract thumbnail URL')
4531         video_thumbnail = mobj.group('thumbnail')
4532
4533         return [{
4534             'id':       video_id,
4535             'url':      video_url,
4536             'ext':      video_extension,
4537             'title':    video_title,
4538             'description': video_description,
4539             'upload_date': video_upload_date,
4540             'uploader_id': video_uploader_id,
4541             'thumbnail': video_thumbnail
4542         }]
4543
4544 class HypemIE(InfoExtractor):
4545     """Information Extractor for hypem"""
4546     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4547
4548     def _real_extract(self, url):
4549         mobj = re.match(self._VALID_URL, url)
4550         if mobj is None:
4551             raise ExtractorError(u'Invalid URL: %s' % url)
4552         track_id = mobj.group(1)
4553
4554         data = { 'ax': 1, 'ts': time.time() }
4555         data_encoded = compat_urllib_parse.urlencode(data)
4556         complete_url = url + "?" + data_encoded
4557         request = compat_urllib_request.Request(complete_url)
4558         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
4559         cookie = urlh.headers.get('Set-Cookie', '')
4560
4561         self.report_extraction(track_id)
4562         mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
4563         if mobj is None:
4564             raise ExtractorError(u'Unable to extrack tracks')
4565         html_tracks = mobj.group(1).strip()
4566         try:
4567             track_list = json.loads(html_tracks)
4568             track = track_list[u'tracks'][0]
4569         except ValueError:
4570             raise ExtractorError(u'Hypemachine contained invalid JSON.')
4571
4572         key = track[u"key"]
4573         track_id = track[u"id"]
4574         artist = track[u"artist"]
4575         title = track[u"song"]
4576
4577         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4578         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4579         request.add_header('cookie', cookie)
4580         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4581         try:
4582             song_data = json.loads(song_data_json)
4583         except ValueError:
4584             raise ExtractorError(u'Hypemachine contained invalid JSON.')
4585         final_url = song_data[u"url"]
4586
4587         return [{
4588             'id':       track_id,
4589             'url':      final_url,
4590             'ext':      "mp3",
4591             'title':    title,
4592             'artist':   artist,
4593         }]
4594
4595 class Vbox7IE(InfoExtractor):
4596     """Information Extractor for Vbox7"""
4597     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
4598
4599     def _real_extract(self,url):
4600         mobj = re.match(self._VALID_URL, url)
4601         if mobj is None:
4602             raise ExtractorError(u'Invalid URL: %s' % url)
4603         video_id = mobj.group(1)
4604
4605         redirect_page, urlh = self._download_webpage_handle(url, video_id)
4606         redirect_url = urlh.geturl() + re.search(r'window\.location = \'(.*)\';', redirect_page).group(1)
4607         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
4608
4609         title = re.search(r'<title>(.*)</title>', webpage)
4610         title = (title.group(1)).split('/')[0].strip()
4611
4612         ext = "flv"
4613         info_url = "http://vbox7.com/play/magare.do"
4614         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
4615         info_request = compat_urllib_request.Request(info_url, data)
4616         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
4617         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
4618         if info_response is None:
4619             raise ExtractorError(u'Unable to extract the media url')
4620         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
4621
4622         return [{
4623             'id':        video_id,
4624             'url':       final_url,
4625             'ext':       ext,
4626             'title':     title,
4627             'thumbnail': thumbnail_url,
4628         }]
4629
4630 def gen_extractors():
4631     """ Return a list of an instance of every supported extractor.
4632     The order does matter; the first extractor matched is the one handling the URL.
4633     """
4634     return [
4635         YoutubePlaylistIE(),
4636         YoutubeChannelIE(),
4637         YoutubeUserIE(),
4638         YoutubeSearchIE(),
4639         YoutubeIE(),
4640         MetacafeIE(),
4641         DailymotionIE(),
4642         GoogleSearchIE(),
4643         PhotobucketIE(),
4644         YahooIE(),
4645         YahooSearchIE(),
4646         DepositFilesIE(),
4647         FacebookIE(),
4648         BlipTVIE(),
4649         BlipTVUserIE(),
4650         VimeoIE(),
4651         MyVideoIE(),
4652         ComedyCentralIE(),
4653         EscapistIE(),
4654         CollegeHumorIE(),
4655         XVideosIE(),
4656         SoundcloudSetIE(),
4657         SoundcloudIE(),
4658         InfoQIE(),
4659         MixcloudIE(),
4660         StanfordOpenClassroomIE(),
4661         MTVIE(),
4662         YoukuIE(),
4663         XNXXIE(),
4664         YouJizzIE(),
4665         PornotubeIE(),
4666         YouPornIE(),
4667         GooglePlusIE(),
4668         ArteTvIE(),
4669         NBAIE(),
4670         WorldStarHipHopIE(),
4671         JustinTVIE(),
4672         FunnyOrDieIE(),
4673         SteamIE(),
4674         UstreamIE(),
4675         RBMARadioIE(),
4676         EightTracksIE(),
4677         KeekIE(),
4678         TEDIE(),
4679         MySpassIE(),
4680         SpiegelIE(),
4681         LiveLeakIE(),
4682         ARDIE(),
4683         ZDFIE(),
4684         TumblrIE(),
4685         BandcampIE(),
4686         RedTubeIE(),
4687         InaIE(),
4688         HowcastIE(),
4689         VineIE(),
4690         FlickrIE(),
4691         TeamcocoIE(),
4692         XHamsterIE(),
4693         HypemIE(),
4694         Vbox7IE(),
4695         GenericIE()
4696     ]
4697
4698 def get_info_extractor(ie_name):
4699     """Returns the info extractor class with the given ie_name"""
4700     return globals()[ie_name+'IE']