youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19
  20 from .utils import *
  21
  22
  23 class InfoExtractor(object):
  24     """Information Extractor class.
  25
  26     Information extractors are the classes that, given a URL, extract
  27     information about the video (or videos) the URL refers to. This
  28     information includes the real video URL, the video title, author and
  29     others. The information is stored in a dictionary which is then
  30     passed to the FileDownloader. The FileDownloader processes this
  31     information possibly downloading the video to the file system, among
  32     other possible outcomes.
  33
  34     The dictionaries must include the following fields:
  35
  36     id:             Video identifier.
  37     url:            Final video URL.
  38     title:          Video title, unescaped.
  39     ext:            Video filename extension.
  40
  41     The following fields are optional:
  42
  43     format:         The video format, defaults to ext (used for --get-format)
  44     thumbnail:      Full URL to a video thumbnail image.
  45     description:    One-line video description.
  46     uploader:       Full name of the video uploader.
  47     upload_date:    Video upload date (YYYYMMDD).
  48     uploader_id:    Nickname or id of the video uploader.
  49     location:       Physical location of the video.
  50     player_url:     SWF Player URL (used for rtmpdump).
  51     subtitles:      The subtitle file contents.
  52     urlhandle:      [internal] The urlHandle to be used to download the file,
  53                     like returned by urllib.request.urlopen
  54
  55     The fields should all be Unicode strings.
  56
  57     Subclasses of this one should re-define the _real_initialize() and
  58     _real_extract() methods and define a _VALID_URL regexp.
  59     Probably, they should also be added to the list of extractors.
  60
  61     _real_extract() must return a *list* of information dictionaries as
  62     described above.
  63
  64     Finally, the _WORKING attribute should be set to False for broken IEs
  65     in order to warn the users and skip the tests.
  66     """
  67
  68     _ready = False
  69     _downloader = None
  70     _WORKING = True
  71
  72     def __init__(self, downloader=None):
  73         """Constructor. Receives an optional downloader."""
  74         self._ready = False
  75         self.set_downloader(downloader)
  76
  77     @classmethod
  78     def suitable(cls, url):
  79         """Receives a URL and returns True if suitable for this IE."""
  80         return re.match(cls._VALID_URL, url) is not None
  81
  82     @classmethod
  83     def working(cls):
  84         """Getter method for _WORKING."""
  85         return cls._WORKING
  86
  87     def initialize(self):
  88         """Initializes an instance (authentication, etc)."""
  89         if not self._ready:
  90             self._real_initialize()
  91             self._ready = True
  92
  93     def extract(self, url):
  94         """Extracts URL information and returns it in list of dicts."""
  95         self.initialize()
  96         return self._real_extract(url)
  97
  98     def set_downloader(self, downloader):
  99         """Sets the downloader for this IE."""
 100         self._downloader = downloader
 101
 102     def _real_initialize(self):
 103         """Real initialization process. Redefine in subclasses."""
 104         pass
 105
 106     def _real_extract(self, url):
 107         """Real extraction process. Redefine in subclasses."""
 108         pass
 109
 110     @property
 111     def IE_NAME(self):
 112         return type(self).__name__[:-2]
 113
 114     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 115         """ Returns the response handle """
 116         if note is None:
 117             self.report_download_webpage(video_id)
 118         elif note is not False:
 119             self.to_screen(u'%s: %s' % (video_id, note))
 120         try:
 121             return compat_urllib_request.urlopen(url_or_request)
 122         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 123             if errnote is None:
 124                 errnote = u'Unable to download webpage'
 125             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 126
 127     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 128         """ Returns a tuple (page content as string, URL handle) """
 129         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 130         content_type = urlh.headers.get('Content-Type', '')
 131         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 132         if m:
 133             encoding = m.group(1)
 134         else:
 135             encoding = 'utf-8'
 136         webpage_bytes = urlh.read()
 137         if self._downloader.params.get('dump_intermediate_pages', False):
 138             try:
 139                 url = url_or_request.get_full_url()
 140             except AttributeError:
 141                 url = url_or_request
 142             self.to_screen(u'Dumping request to ' + url)
 143             dump = base64.b64encode(webpage_bytes).decode('ascii')
 144             self._downloader.to_screen(dump)
 145         content = webpage_bytes.decode(encoding, 'replace')
 146         return (content, urlh)
 147
 148     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 149         """ Returns the data of the page as a string """
 150         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 151
 152     def to_screen(self, msg):
 153         """Print msg to screen, prefixing it with '[ie_name]'"""
 154         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 155
 156     def report_extraction(self, id_or_name):
 157         """Report information extraction."""
 158         self.to_screen(u'%s: Extracting information' % id_or_name)
 159
 160     def report_download_webpage(self, video_id):
 161         """Report webpage download."""
 162         self.to_screen(u'%s: Downloading webpage' % video_id)
 163
 164     def report_age_confirmation(self):
 165         """Report attempt to confirm age."""
 166         self.to_screen(u'Confirming age')
 167
 168     #Methods for following #608
 169     #They set the correct value of the '_type' key
 170     def video_result(self, video_info):
 171         """Returns a video"""
 172         video_info['_type'] = 'video'
 173         return video_info
 174     def url_result(self, url, ie=None):
 175         """Returns a url that points to a page that should be processed"""
 176         #TODO: ie should be the class used for getting the info
 177         video_info = {'_type': 'url',
 178                       'url': url,
 179                       'ie_key': ie}
 180         return video_info
 181     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 182         """Returns a playlist"""
 183         video_info = {'_type': 'playlist',
 184                       'entries': entries}
 185         if playlist_id:
 186             video_info['id'] = playlist_id
 187         if playlist_title:
 188             video_info['title'] = playlist_title
 189         return video_info
 190
 191
 192 class YoutubeIE(InfoExtractor):
 193     """Information extractor for youtube.com."""
 194
 195     _VALID_URL = r"""^
 196                      (
 197                          (?:https?://)?                                       # http(s):// (optional)
 198                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 199                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 200                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 201                          (?:                                                  # the various things that can precede the ID:
 202                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 203                              |(?:                                             # or the v= param in all its forms
 204                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 205                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 206                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 207                                  v=
 208                              )
 209                          )?                                                   # optional -> youtube.com/xxxx is OK
 210                      )?                                                       # all until now is optional -> you can pass the naked ID
 211                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 212                      (?(1).+)?                                                # if we found the ID, everything can follow
 213                      $"""
 214     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 215     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 216     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 217     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 218     _NETRC_MACHINE = 'youtube'
 219     # Listed in order of quality
 220     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 221     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 222     _video_extensions = {
 223         '13': '3gp',
 224         '17': 'mp4',
 225         '18': 'mp4',
 226         '22': 'mp4',
 227         '37': 'mp4',
 228         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 229         '43': 'webm',
 230         '44': 'webm',
 231         '45': 'webm',
 232         '46': 'webm',
 233     }
 234     _video_dimensions = {
 235         '5': '240x400',
 236         '6': '???',
 237         '13': '???',
 238         '17': '144x176',
 239         '18': '360x640',
 240         '22': '720x1280',
 241         '34': '360x640',
 242         '35': '480x854',
 243         '37': '1080x1920',
 244         '38': '3072x4096',
 245         '43': '360x640',
 246         '44': '480x854',
 247         '45': '720x1280',
 248         '46': '1080x1920',
 249     }
 250     IE_NAME = u'youtube'
 251
 252     @classmethod
 253     def suitable(cls, url):
 254         """Receives a URL and returns True if suitable for this IE."""
 255         if YoutubePlaylistIE.suitable(url): return False
 256         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 257
 258     def report_lang(self):
 259         """Report attempt to set language."""
 260         self.to_screen(u'Setting language')
 261
 262     def report_login(self):
 263         """Report attempt to log in."""
 264         self.to_screen(u'Logging in')
 265
 266     def report_video_webpage_download(self, video_id):
 267         """Report attempt to download video webpage."""
 268         self.to_screen(u'%s: Downloading video webpage' % video_id)
 269
 270     def report_video_info_webpage_download(self, video_id):
 271         """Report attempt to download video info webpage."""
 272         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 273
 274     def report_video_subtitles_download(self, video_id):
 275         """Report attempt to download video info webpage."""
 276         self.to_screen(u'%s: Checking available subtitles' % video_id)
 277
 278     def report_video_subtitles_request(self, video_id, sub_lang, format):
 279         """Report attempt to download video info webpage."""
 280         self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 281
 282     def report_video_subtitles_available(self, video_id, sub_lang_list):
 283         """Report available subtitles."""
 284         sub_lang = ",".join(list(sub_lang_list.keys()))
 285         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
 286
 287     def report_information_extraction(self, video_id):
 288         """Report attempt to extract video information."""
 289         self.to_screen(u'%s: Extracting video information' % video_id)
 290
 291     def report_unavailable_format(self, video_id, format):
 292         """Report extracted video URL."""
 293         self.to_screen(u'%s: Format %s not available' % (video_id, format))
 294
 295     def report_rtmp_download(self):
 296         """Indicate the download will use the RTMP protocol."""
 297         self.to_screen(u'RTMP download detected')
 298
 299     def _get_available_subtitles(self, video_id):
 300         self.report_video_subtitles_download(video_id)
 301         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 302         try:
 303             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 304         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 305             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 306         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 307         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 308         if not sub_lang_list:
 309             return (u'video doesn\'t have subtitles', None)
 310         return sub_lang_list
 311
 312     def _list_available_subtitles(self, video_id):
 313         sub_lang_list = self._get_available_subtitles(video_id)
 314         self.report_video_subtitles_available(video_id, sub_lang_list)
 315
 316     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 317         """
 318         Return tuple:
 319         (error_message, sub_lang, sub)
 320         """
 321         self.report_video_subtitles_request(video_id, sub_lang, format)
 322         params = compat_urllib_parse.urlencode({
 323             'lang': sub_lang,
 324             'name': sub_name,
 325             'v': video_id,
 326             'fmt': format,
 327         })
 328         url = 'http://www.youtube.com/api/timedtext?' + params
 329         try:
 330             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 331         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 332             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 333         if not sub:
 334             return (u'Did not fetch video subtitles', None, None)
 335         return (None, sub_lang, sub)
 336
 337     def _extract_subtitle(self, video_id):
 338         """
 339         Return a list with a tuple:
 340         [(error_message, sub_lang, sub)]
 341         """
 342         sub_lang_list = self._get_available_subtitles(video_id)
 343         sub_format = self._downloader.params.get('subtitlesformat')
 344         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 345             return [(sub_lang_list[0], None, None)]
 346         if self._downloader.params.get('subtitleslang', False):
 347             sub_lang = self._downloader.params.get('subtitleslang')
 348         elif 'en' in sub_lang_list:
 349             sub_lang = 'en'
 350         else:
 351             sub_lang = list(sub_lang_list.keys())[0]
 352         if not sub_lang in sub_lang_list:
 353             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 354
 355         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 356         return [subtitle]
 357
 358     def _extract_all_subtitles(self, video_id):
 359         sub_lang_list = self._get_available_subtitles(video_id)
 360         sub_format = self._downloader.params.get('subtitlesformat')
 361         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 362             return [(sub_lang_list[0], None, None)]
 363         subtitles = []
 364         for sub_lang in sub_lang_list:
 365             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 366             subtitles.append(subtitle)
 367         return subtitles
 368
 369     def _print_formats(self, formats):
 370         print('Available formats:')
 371         for x in formats:
 372             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 373
 374     def _real_initialize(self):
 375         if self._downloader is None:
 376             return
 377
 378         username = None
 379         password = None
 380         downloader_params = self._downloader.params
 381
 382         # Attempt to use provided username and password or .netrc data
 383         if downloader_params.get('username', None) is not None:
 384             username = downloader_params['username']
 385             password = downloader_params['password']
 386         elif downloader_params.get('usenetrc', False):
 387             try:
 388                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 389                 if info is not None:
 390                     username = info[0]
 391                     password = info[2]
 392                 else:
 393                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 394             except (IOError, netrc.NetrcParseError) as err:
 395                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 396                 return
 397
 398         # Set language
 399         request = compat_urllib_request.Request(self._LANG_URL)
 400         try:
 401             self.report_lang()
 402             compat_urllib_request.urlopen(request).read()
 403         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 404             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 405             return
 406
 407         # No authentication to be performed
 408         if username is None:
 409             return
 410
 411         request = compat_urllib_request.Request(self._LOGIN_URL)
 412         try:
 413             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 414         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 415             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 416             return
 417
 418         galx = None
 419         dsh = None
 420         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 421         if match:
 422           galx = match.group(1)
 423
 424         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 425         if match:
 426           dsh = match.group(1)
 427
 428         # Log in
 429         login_form_strs = {
 430                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 431                 u'Email': username,
 432                 u'GALX': galx,
 433                 u'Passwd': password,
 434                 u'PersistentCookie': u'yes',
 435                 u'_utf8': u'霱',
 436                 u'bgresponse': u'js_disabled',
 437                 u'checkConnection': u'',
 438                 u'checkedDomains': u'youtube',
 439                 u'dnConn': u'',
 440                 u'dsh': dsh,
 441                 u'pstMsg': u'0',
 442                 u'rmShown': u'1',
 443                 u'secTok': u'',
 444                 u'signIn': u'Sign in',
 445                 u'timeStmp': u'',
 446                 u'service': u'youtube',
 447                 u'uilel': u'3',
 448                 u'hl': u'en_US',
 449         }
 450         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 451         # chokes on unicode
 452         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 453         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 454         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 455         try:
 456             self.report_login()
 457             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 458             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 459                 self._downloader.report_warning(u'unable to log in: bad username or password')
 460                 return
 461         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 462             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 463             return
 464
 465         # Confirm age
 466         age_form = {
 467                 'next_url':     '/',
 468                 'action_confirm':   'Confirm',
 469                 }
 470         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 471         try:
 472             self.report_age_confirmation()
 473             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 474         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 475             self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
 476             return
 477
 478     def _extract_id(self, url):
 479         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 480         if mobj is None:
 481             self._downloader.report_error(u'invalid URL: %s' % url)
 482             return
 483         video_id = mobj.group(2)
 484         return video_id
 485
 486     def _real_extract(self, url):
 487         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 488         mobj = re.search(self._NEXT_URL_RE, url)
 489         if mobj:
 490             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 491         video_id = self._extract_id(url)
 492
 493         # Get video webpage
 494         self.report_video_webpage_download(video_id)
 495         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 496         request = compat_urllib_request.Request(url)
 497         try:
 498             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 499         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 500             self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
 501             return
 502
 503         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 504
 505         # Attempt to extract SWF player URL
 506         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 507         if mobj is not None:
 508             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 509         else:
 510             player_url = None
 511
 512         # Get video info
 513         self.report_video_info_webpage_download(video_id)
 514         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 515             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 516                     % (video_id, el_type))
 517             video_info_webpage = self._download_webpage(video_info_url, video_id,
 518                                     note=False,
 519                                     errnote='unable to download video info webpage')
 520             video_info = compat_parse_qs(video_info_webpage)
 521             if 'token' in video_info:
 522                 break
 523         if 'token' not in video_info:
 524             if 'reason' in video_info:
 525                 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
 526             else:
 527                 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
 528             return
 529
 530         # Check for "rental" videos
 531         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 532             self._downloader.report_error(u'"rental" videos not supported')
 533             return
 534
 535         # Start extracting information
 536         self.report_information_extraction(video_id)
 537
 538         # uploader
 539         if 'author' not in video_info:
 540             self._downloader.report_error(u'unable to extract uploader name')
 541             return
 542         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 543
 544         # uploader_id
 545         video_uploader_id = None
 546         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 547         if mobj is not None:
 548             video_uploader_id = mobj.group(1)
 549         else:
 550             self._downloader.report_warning(u'unable to extract uploader nickname')
 551
 552         # title
 553         if 'title' not in video_info:
 554             self._downloader.report_error(u'unable to extract video title')
 555             return
 556         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 557
 558         # thumbnail image
 559         if 'thumbnail_url' not in video_info:
 560             self._downloader.report_warning(u'unable to extract video thumbnail')
 561             video_thumbnail = ''
 562         else:   # don't panic if we can't find it
 563             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 564
 565         # upload date
 566         upload_date = None
 567         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 568         if mobj is not None:
 569             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 570             upload_date = unified_strdate(upload_date)
 571
 572         # description
 573         video_description = get_element_by_id("eow-description", video_webpage)
 574         if video_description:
 575             video_description = clean_html(video_description)
 576         else:
 577             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 578             if fd_mobj:
 579                 video_description = unescapeHTML(fd_mobj.group(1))
 580             else:
 581                 video_description = u''
 582
 583         # subtitles
 584         video_subtitles = None
 585
 586         if self._downloader.params.get('writesubtitles', False):
 587             video_subtitles = self._extract_subtitle(video_id)
 588             if video_subtitles:
 589                 (sub_error, sub_lang, sub) = video_subtitles[0]
 590                 if sub_error:
 591                     self._downloader.report_error(sub_error)
 592
 593         if self._downloader.params.get('allsubtitles', False):
 594             video_subtitles = self._extract_all_subtitles(video_id)
 595             for video_subtitle in video_subtitles:
 596                 (sub_error, sub_lang, sub) = video_subtitle
 597                 if sub_error:
 598                     self._downloader.report_error(sub_error)
 599
 600         if self._downloader.params.get('listsubtitles', False):
 601             sub_lang_list = self._list_available_subtitles(video_id)
 602             return
 603
 604         if 'length_seconds' not in video_info:
 605             self._downloader.report_warning(u'unable to extract video duration')
 606             video_duration = ''
 607         else:
 608             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 609
 610         # token
 611         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 612
 613         # Decide which formats to download
 614         req_format = self._downloader.params.get('format', None)
 615
 616         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 617             self.report_rtmp_download()
 618             video_url_list = [(None, video_info['conn'][0])]
 619         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 620             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 621             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 622             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 623             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 624
 625             format_limit = self._downloader.params.get('format_limit', None)
 626             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 627             if format_limit is not None and format_limit in available_formats:
 628                 format_list = available_formats[available_formats.index(format_limit):]
 629             else:
 630                 format_list = available_formats
 631             existing_formats = [x for x in format_list if x in url_map]
 632             if len(existing_formats) == 0:
 633                 raise ExtractorError(u'no known formats available for video')
 634             if self._downloader.params.get('listformats', None):
 635                 self._print_formats(existing_formats)
 636                 return
 637             if req_format is None or req_format == 'best':
 638                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 639             elif req_format == 'worst':
 640                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 641             elif req_format in ('-1', 'all'):
 642                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 643             else:
 644                 # Specific formats. We pick the first in a slash-delimeted sequence.
 645                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 646                 req_formats = req_format.split('/')
 647                 video_url_list = None
 648                 for rf in req_formats:
 649                     if rf in url_map:
 650                         video_url_list = [(rf, url_map[rf])]
 651                         break
 652                 if video_url_list is None:
 653                     raise ExtractorError(u'requested format not available')
 654         else:
 655             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
 656
 657         results = []
 658         for format_param, video_real_url in video_url_list:
 659             # Extension
 660             video_extension = self._video_extensions.get(format_param, 'flv')
 661
 662             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 663                                               self._video_dimensions.get(format_param, '???'))
 664
 665             results.append({
 666                 'id':       video_id,
 667                 'url':      video_real_url,
 668                 'uploader': video_uploader,
 669                 'uploader_id': video_uploader_id,
 670                 'upload_date':  upload_date,
 671                 'title':    video_title,
 672                 'ext':      video_extension,
 673                 'format':   video_format,
 674                 'thumbnail':    video_thumbnail,
 675                 'description':  video_description,
 676                 'player_url':   player_url,
 677                 'subtitles':    video_subtitles,
 678                 'duration':     video_duration
 679             })
 680         return results
 681
 682
 683 class MetacafeIE(InfoExtractor):
 684     """Information Extractor for metacafe.com."""
 685
 686     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 687     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 688     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 689     IE_NAME = u'metacafe'
 690
 691     def report_disclaimer(self):
 692         """Report disclaimer retrieval."""
 693         self.to_screen(u'Retrieving disclaimer')
 694
 695     def _real_initialize(self):
 696         # Retrieve disclaimer
 697         request = compat_urllib_request.Request(self._DISCLAIMER)
 698         try:
 699             self.report_disclaimer()
 700             disclaimer = compat_urllib_request.urlopen(request).read()
 701         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 702             self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
 703             return
 704
 705         # Confirm age
 706         disclaimer_form = {
 707             'filters': '0',
 708             'submit': "Continue - I'm over 18",
 709             }
 710         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 711         try:
 712             self.report_age_confirmation()
 713             disclaimer = compat_urllib_request.urlopen(request).read()
 714         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 715             self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
 716             return
 717
 718     def _real_extract(self, url):
 719         # Extract id and simplified title from URL
 720         mobj = re.match(self._VALID_URL, url)
 721         if mobj is None:
 722             self._downloader.report_error(u'invalid URL: %s' % url)
 723             return
 724
 725         video_id = mobj.group(1)
 726
 727         # Check if video comes from YouTube
 728         mobj2 = re.match(r'^yt-(.*)$', video_id)
 729         if mobj2 is not None:
 730             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
 731
 732         # Retrieve video webpage to extract further information
 733         webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
 734
 735         # Extract URL, uploader and title from webpage
 736         self.report_extraction(video_id)
 737         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 738         if mobj is not None:
 739             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 740             video_extension = mediaURL[-3:]
 741
 742             # Extract gdaKey if available
 743             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 744             if mobj is None:
 745                 video_url = mediaURL
 746             else:
 747                 gdaKey = mobj.group(1)
 748                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 749         else:
 750             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 751             if mobj is None:
 752                 self._downloader.report_error(u'unable to extract media URL')
 753                 return
 754             vardict = compat_parse_qs(mobj.group(1))
 755             if 'mediaData' not in vardict:
 756                 self._downloader.report_error(u'unable to extract media URL')
 757                 return
 758             mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
 759             if mobj is None:
 760                 self._downloader.report_error(u'unable to extract media URL')
 761                 return
 762             mediaURL = mobj.group('mediaURL').replace('\\/', '/')
 763             video_extension = mediaURL[-3:]
 764             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
 765
 766         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 767         if mobj is None:
 768             self._downloader.report_error(u'unable to extract title')
 769             return
 770         video_title = mobj.group(1).decode('utf-8')
 771
 772         mobj = re.search(r'submitter=(.*?);', webpage)
 773         if mobj is None:
 774             self._downloader.report_error(u'unable to extract uploader nickname')
 775             return
 776         video_uploader = mobj.group(1)
 777
 778         return [{
 779             'id':       video_id.decode('utf-8'),
 780             'url':      video_url.decode('utf-8'),
 781             'uploader': video_uploader.decode('utf-8'),
 782             'upload_date':  None,
 783             'title':    video_title,
 784             'ext':      video_extension.decode('utf-8'),
 785         }]
 786
 787 class DailymotionIE(InfoExtractor):
 788     """Information Extractor for Dailymotion"""
 789
 790     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 791     IE_NAME = u'dailymotion'
 792
 793     def _real_extract(self, url):
 794         # Extract id and simplified title from URL
 795         mobj = re.match(self._VALID_URL, url)
 796         if mobj is None:
 797             self._downloader.report_error(u'invalid URL: %s' % url)
 798             return
 799
 800         video_id = mobj.group(1).split('_')[0].split('?')[0]
 801
 802         video_extension = 'mp4'
 803
 804         # Retrieve video webpage to extract further information
 805         request = compat_urllib_request.Request(url)
 806         request.add_header('Cookie', 'family_filter=off')
 807         webpage = self._download_webpage(request, video_id)
 808
 809         # Extract URL, uploader and title from webpage
 810         self.report_extraction(video_id)
 811         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 812         if mobj is None:
 813             self._downloader.report_error(u'unable to extract media URL')
 814             return
 815         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 816
 817         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 818             if key in flashvars:
 819                 max_quality = key
 820                 self.to_screen(u'Using %s' % key)
 821                 break
 822         else:
 823             self._downloader.report_error(u'unable to extract video URL')
 824             return
 825
 826         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 827         if mobj is None:
 828             self._downloader.report_error(u'unable to extract video URL')
 829             return
 830
 831         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 832
 833         # TODO: support choosing qualities
 834
 835         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 836         if mobj is None:
 837             self._downloader.report_error(u'unable to extract title')
 838             return
 839         video_title = unescapeHTML(mobj.group('title'))
 840
 841         video_uploader = None
 842         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 843         if mobj is None:
 844             # lookin for official user
 845             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 846             if mobj_official is None:
 847                 self._downloader.report_warning(u'unable to extract uploader nickname')
 848             else:
 849                 video_uploader = mobj_official.group(1)
 850         else:
 851             video_uploader = mobj.group(1)
 852
 853         video_upload_date = None
 854         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 855         if mobj is not None:
 856             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 857
 858         return [{
 859             'id':       video_id,
 860             'url':      video_url,
 861             'uploader': video_uploader,
 862             'upload_date':  video_upload_date,
 863             'title':    video_title,
 864             'ext':      video_extension,
 865         }]
 866
 867
 868 class PhotobucketIE(InfoExtractor):
 869     """Information extractor for photobucket.com."""
 870
 871     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 872     IE_NAME = u'photobucket'
 873
 874     def _real_extract(self, url):
 875         # Extract id from URL
 876         mobj = re.match(self._VALID_URL, url)
 877         if mobj is None:
 878             self._downloader.report_error(u'Invalid URL: %s' % url)
 879             return
 880
 881         video_id = mobj.group(1)
 882
 883         video_extension = 'flv'
 884
 885         # Retrieve video webpage to extract further information
 886         request = compat_urllib_request.Request(url)
 887         try:
 888             self.report_download_webpage(video_id)
 889             webpage = compat_urllib_request.urlopen(request).read()
 890         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 891             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
 892             return
 893
 894         # Extract URL, uploader, and title from webpage
 895         self.report_extraction(video_id)
 896         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 897         if mobj is None:
 898             self._downloader.report_error(u'unable to extract media URL')
 899             return
 900         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 901
 902         video_url = mediaURL
 903
 904         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 905         if mobj is None:
 906             self._downloader.report_error(u'unable to extract title')
 907             return
 908         video_title = mobj.group(1).decode('utf-8')
 909
 910         video_uploader = mobj.group(2).decode('utf-8')
 911
 912         return [{
 913             'id':       video_id.decode('utf-8'),
 914             'url':      video_url.decode('utf-8'),
 915             'uploader': video_uploader,
 916             'upload_date':  None,
 917             'title':    video_title,
 918             'ext':      video_extension.decode('utf-8'),
 919         }]
 920
 921
 922 class YahooIE(InfoExtractor):
 923     """Information extractor for video.yahoo.com."""
 924
 925     _WORKING = False
 926     # _VALID_URL matches all Yahoo! Video URLs
 927     # _VPAGE_URL matches only the extractable '/watch/' URLs
 928     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 929     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 930     IE_NAME = u'video.yahoo'
 931
 932     def _real_extract(self, url, new_video=True):
 933         # Extract ID from URL
 934         mobj = re.match(self._VALID_URL, url)
 935         if mobj is None:
 936             self._downloader.report_error(u'Invalid URL: %s' % url)
 937             return
 938
 939         video_id = mobj.group(2)
 940         video_extension = 'flv'
 941
 942         # Rewrite valid but non-extractable URLs as
 943         # extractable English language /watch/ URLs
 944         if re.match(self._VPAGE_URL, url) is None:
 945             request = compat_urllib_request.Request(url)
 946             try:
 947                 webpage = compat_urllib_request.urlopen(request).read()
 948             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 949                 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
 950                 return
 951
 952             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 953             if mobj is None:
 954                 self._downloader.report_error(u'Unable to extract id field')
 955                 return
 956             yahoo_id = mobj.group(1)
 957
 958             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 959             if mobj is None:
 960                 self._downloader.report_error(u'Unable to extract vid field')
 961                 return
 962             yahoo_vid = mobj.group(1)
 963
 964             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 965             return self._real_extract(url, new_video=False)
 966
 967         # Retrieve video webpage to extract further information
 968         request = compat_urllib_request.Request(url)
 969         try:
 970             self.report_download_webpage(video_id)
 971             webpage = compat_urllib_request.urlopen(request).read()
 972         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 973             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
 974             return
 975
 976         # Extract uploader and title from webpage
 977         self.report_extraction(video_id)
 978         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 979         if mobj is None:
 980             self._downloader.report_error(u'unable to extract video title')
 981             return
 982         video_title = mobj.group(1).decode('utf-8')
 983
 984         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 985         if mobj is None:
 986             self._downloader.report_error(u'unable to extract video uploader')
 987             return
 988         video_uploader = mobj.group(1).decode('utf-8')
 989
 990         # Extract video thumbnail
 991         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 992         if mobj is None:
 993             self._downloader.report_error(u'unable to extract video thumbnail')
 994             return
 995         video_thumbnail = mobj.group(1).decode('utf-8')
 996
 997         # Extract video description
 998         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 999         if mobj is None:
1000             self._downloader.report_error(u'unable to extract video description')
1001             return
1002         video_description = mobj.group(1).decode('utf-8')
1003         if not video_description:
1004             video_description = 'No description available.'
1005
1006         # Extract video height and width
1007         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1008         if mobj is None:
1009             self._downloader.report_error(u'unable to extract video height')
1010             return
1011         yv_video_height = mobj.group(1)
1012
1013         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1014         if mobj is None:
1015             self._downloader.report_error(u'unable to extract video width')
1016             return
1017         yv_video_width = mobj.group(1)
1018
1019         # Retrieve video playlist to extract media URL
1020         # I'm not completely sure what all these options are, but we
1021         # seem to need most of them, otherwise the server sends a 401.
1022         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1023         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1024         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1025                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1026                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1027         try:
1028             self.report_download_webpage(video_id)
1029             webpage = compat_urllib_request.urlopen(request).read()
1030         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1031             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1032             return
1033
1034         # Extract media URL from playlist XML
1035         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1036         if mobj is None:
1037             self._downloader.report_error(u'Unable to extract media URL')
1038             return
1039         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1040         video_url = unescapeHTML(video_url)
1041
1042         return [{
1043             'id':       video_id.decode('utf-8'),
1044             'url':      video_url,
1045             'uploader': video_uploader,
1046             'upload_date':  None,
1047             'title':    video_title,
1048             'ext':      video_extension.decode('utf-8'),
1049             'thumbnail':    video_thumbnail.decode('utf-8'),
1050             'description':  video_description,
1051         }]
1052
1053
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dictionary for url.

        Metadata comes from the config JSON embedded in the page (the text
        between ' = {config:' and ',assets:'); the description and upload
        date are scraped from the page markup.  The new_video parameter is
        accepted but not used by this method.  Failures are reported through
        the downloader and the method returns None.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # play_redirect_hls links are replaced by the regular video page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # Any parsing problem is reported as before, but unlike a bare
            # "except:" this lets KeyboardInterrupt and SystemExit propagate.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id (the last path segment of the owner URL)
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        available = config["video"]["files"]
        for codec_name, codec_extension in codecs:
            if codec_name in available:
                if 'hd' in available[codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in available[codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    # Neither hd nor sd: use the first quality that is listed
                    files['other'].append((codec_name, codec_extension, available[codec_name][0]))

        # Pick the first codec of the best non-empty quality bucket
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_codec, video_extension, video_quality = files[quality][0]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1158
1159
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    URLs whose last component matches _LIVE_URL are treated as live streams;
    their pages are probed but no result is returned for them.  All other
    URLs go through the Plus7 extraction.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download url and return its content, or None after reporting an error."""
        try:
            # Built inside the try block: on Python 3 the Request constructor
            # itself raises ValueError for a URL without a scheme.
            request = compat_urllib_request.Request(url)
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        # On Python 3 read() yields bytes, which the str patterns used by
        # grep_webpage cannot search; on Python 2 the page is already a str.
        if not isinstance(webpage, str):
            webpage = webpage.decode('utf-8', 'replace')
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Download url, search it with regex and collect the matched groups.

        matchTuples is a sequence of (group, key, error_message) tuples: each
        group of the match is stored under key in the returned dictionary.
        Returns None when the download fails, when the regex does not match
        or when one of the groups is empty (the problem is reported through
        the downloader in every case).
        """
        page = self.fetch_webpage(url)
        if page is None:
            # fetch_webpage has already reported the failure
            return

        mobj = re.search(regex, page, regexFlags)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        info = {}
        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Probe the pages of a live stream; nothing is returned.

        The second page yields the stream path, the player and the rtmp URL,
        but no info dictionary is built from them.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # The result is not used: the old code combined info['url'] and
        # info['path'] into a local that was never returned.
        self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )

    def extractPlus7Stream(self, url):
        """Follow the chain of Plus7 reference files and return the info dictionary.

        Returns None when one of the intermediate pages cannot be fetched or
        parsed (the problem has been reported through the downloader).
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        if info is None:
            return
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return

        video_title = info.get('title')
        # Decode byte strings only; text has no .decode() on Python 3
        if isinstance(video_title, bytes):
            video_title = video_title.decode('utf-8')

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        video_title,
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Live stream URLs and failed extractions yield None.
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return

        info = self.extractPlus7Stream(url)
        if info is None:
            return
        return [info]
1283
1284
1285 class GenericIE(InfoExtractor):
1286     """Generic last-resort information extractor."""
1287
1288     _VALID_URL = r'.*'
1289     IE_NAME = u'generic'
1290
1291     def report_download_webpage(self, video_id):
1292         """Report webpage download."""
1293         if not self._downloader.params.get('test', False):
1294             self._downloader.report_warning(u'Falling back on generic information extractor.')
1295         super(GenericIE, self).report_download_webpage(video_id)
1296
1297     def report_following_redirect(self, new_url):
1298         """Report information extraction."""
1299         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1300
1301     def _test_redirect(self, url):
1302         """Check if it is a redirect, like url shorteners, in case return the new url."""
1303         class HeadRequest(compat_urllib_request.Request):
1304             def get_method(self):
1305                 return "HEAD"
1306
1307         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1308             """
1309             Subclass the HTTPRedirectHandler to make it use our
1310             HeadRequest also on the redirected URL
1311             """
1312             def redirect_request(self, req, fp, code, msg, headers, newurl):
1313                 if code in (301, 302, 303, 307):
1314                     newurl = newurl.replace(' ', '%20')
1315                     newheaders = dict((k,v) for k,v in req.headers.items()
1316                                       if k.lower() not in ("content-length", "content-type"))
1317                     return HeadRequest(newurl,
1318                                        headers=newheaders,
1319                                        origin_req_host=req.get_origin_req_host(),
1320                                        unverifiable=True)
1321                 else:
1322                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1323
1324         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1325             """
1326             Fallback to GET if HEAD is not allowed (405 HTTP error)
1327             """
1328             def http_error_405(self, req, fp, code, msg, headers):
1329                 fp.read()
1330                 fp.close()
1331
1332                 newheaders = dict((k,v) for k,v in req.headers.items()
1333                                   if k.lower() not in ("content-length", "content-type"))
1334                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1335                                                  headers=newheaders,
1336                                                  origin_req_host=req.get_origin_req_host(),
1337                                                  unverifiable=True))
1338
1339         # Build our opener
1340         opener = compat_urllib_request.OpenerDirector()
1341         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1342                         HTTPMethodFallback, HEADRedirectHandler,
1343                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1344             opener.add_handler(handler())
1345
1346         response = opener.open(HeadRequest(url))
1347         new_url = response.geturl()
1348
1349         if url == new_url:
1350             return False
1351
1352         self.report_following_redirect(new_url)
1353         return new_url
1354
1355     def _real_extract(self, url):
1356         new_url = self._test_redirect(url)
1357         if new_url: return [self.url_result(new_url)]
1358
1359         video_id = url.split('/')[-1]
1360         try:
1361             webpage = self._download_webpage(url, video_id)
1362         except ValueError as err:
1363             # since this is the last-resort InfoExtractor, if
1364             # this error is thrown, it'll be thrown here
1365             self._downloader.report_error(u'Invalid URL: %s' % url)
1366             return
1367
1368         self.report_extraction(video_id)
1369         # Start with something easy: JW Player in SWFObject
1370         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1371         if mobj is None:
1372             # Broaden the search a little bit
1373             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1374         if mobj is None:
1375             # Broaden the search a little bit: JWPlayer JS loader
1376             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1377         if mobj is None:
1378             self._downloader.report_error(u'Invalid URL: %s' % url)
1379             return
1380
1381         # It's possible that one of the regexes
1382         # matched, but returned an empty group:
1383         if mobj.group(1) is None:
1384             self._downloader.report_error(u'Invalid URL: %s' % url)
1385             return
1386
1387         video_url = compat_urllib_parse.unquote(mobj.group(1))
1388         video_id = os.path.basename(video_url)
1389
1390         # here's a fun little line of code for you:
1391         video_extension = os.path.splitext(video_id)[1][1:]
1392         video_id = os.path.splitext(video_id)[0]
1393
1394         # it's tempting to parse this further, but you would
1395         # have to take into account all the variations like
1396         #   Video Title - Site Name
1397         #   Site Name | Video Title
1398         #   Video Title - Tagline | Site Name
1399         # and so on and so forth; it's just not practical
1400         mobj = re.search(r'<title>(.*)</title>', webpage)
1401         if mobj is None:
1402             self._downloader.report_error(u'unable to extract title')
1403             return
1404         video_title = mobj.group(1)
1405
1406         # video uploader is domain name
1407         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1408         if mobj is None:
1409             self._downloader.report_error(u'unable to extract title')
1410             return
1411         video_uploader = mobj.group(1)
1412
1413         return [{
1414             'id':       video_id,
1415             'url':      video_url,
1416             'uploader': video_uploader,
1417             'upload_date':  None,
1418             'title':    video_title,
1419             'ext':      video_extension,
1420         }]
1421
1422
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts pseudo-URLs of the form ``ytsearch:terms``, ``ytsearchN:terms``
    and ``ytsearchall:terms`` and turns the hits of the JSON-C search feed
    into url_results for the 'Youtube' extractor.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search pseudo-URL and return the matching url_results.

        An empty count means one result, ``all`` means up to
        _max_youtube_results, a number is clamped to that maximum.
        Returns None after reporting an error for an invalid query or a
        non-positive count.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves may
        # contain colons
        prefix, query = query.split(':', 1)
        # Drop the leading 'ytsearch' (8 characters), keeping the count
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._max_youtube_results)

        # Only the integer conversion is guarded, so a ValueError raised
        # while fetching results is not mistaken for a bad count
        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            return self._get_n_results(query, 1)

        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_youtube_results:
            self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Pages through the API 50 items at a time and returns a list of at
        most n url_results.  Returns None after reporting an error when a
        page cannot be downloaded or carries no 'items'.
        """

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            video_ids.extend(video['id'] for video in api_response['items'])

            # Never page beyond what the API says is available
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed us past n; the slice trims the excess
        return [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                for video_id in video_ids[:n]]
1493
1494
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepts pseudo-URLs of the form ``gvsearch:terms``, ``gvsearchN:terms``
    and ``gvsearchall:terms`` and hands every hit to the downloader.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search pseudo-URL and download the requested results.

        An empty count means one result, ``all`` means up to
        _max_google_results, a number is clamped to that maximum.
        Always returns None; the videos are passed to the downloader.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves may
        # contain colons
        prefix, query = query.split(':', 1)
        # Drop the leading 'gvsearch' (8 characters), keeping the count
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return

        # Only the integer conversion is guarded, so a ValueError raised
        # while downloading is not mistaken for a bad count
        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_google_results:
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query

        Result pages are scanned until n distinct video ids were collected
        or no further page is advertised; the collected videos are then
        passed to the downloader one by one.  Nothing is downloaded when a
        result page cannot be retrieved.
        """

        video_ids = []
        seen_ids = set()
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode so that the str patterns below can be applied on
                # Python 3 as well; undecodable bytes are replaced
                page = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            reached_limit = False
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id in seen_ids:
                    continue
                seen_ids.add(video_id)
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    reached_limit = True
                    break

            if reached_limit or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break

            pagenum = pagenum + 1

        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
1572
1573
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts pseudo-URLs of the form ``yvsearch:terms``, ``yvsearchN:terms``
    and ``yvsearchall:terms`` and hands every hit to the downloader.
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search pseudo-URL and download the requested results.

        An empty count means one result, ``all`` means up to
        _max_yahoo_results, a number is clamped to that maximum.
        Always returns None; the videos are passed to the downloader.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves may
        # contain colons
        prefix, query = query.split(':', 1)
        # Drop the leading 'yvsearch' (8 characters), keeping the count
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return

        # Only the integer conversion is guarded, so a ValueError raised
        # while downloading is not mistaken for a bad count
        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query

        Result pages are scanned until n distinct video ids were collected
        or no further page is advertised; the collected videos are then
        passed to the downloader one by one.  Nothing is downloaded when a
        result page cannot be retrieved.
        """

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode so that the str patterns below can be applied on
                # Python 3 as well; undecodable bytes are replaced
                page = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            reached_limit = False
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id in already_seen:
                    continue
                already_seen.add(video_id)
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    reached_limit = True
                    break

            if reached_limit or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break

            pagenum = pagenum + 1

        for video_id in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
1655
1656
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose pattern: either a youtube.com URL carrying the playlist id
    # (group 1) or a bare PL/EC/UU-prefixed playlist id (group 2).
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Return a one-element list holding the playlist_result for url.

        The playlist feed is fetched _MAX_RESULTS entries at a time and the
        videos are ordered by their position in the playlist.  Returns None
        after reporting an error for an invalid URL or an unusable API
        response.
        """
        # Extract playlist id
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return
        playlist_id = match.group(1) or match.group(2)

        # Download playlist videos from API
        positioned_videos = []
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            entries = feed['entry']
            # Entries without 'content' are left out
            for entry in entries:
                if 'content' in entry:
                    positioned_videos.append((entry['yt$position']['$t'], entry['content']['src']))

            # A short page is the last one
            if len(entries) < self._MAX_RESULTS:
                break

        # Tuples sort by playlist position first
        positioned_videos.sort()

        url_results = [self.url_result(video_url, 'Youtube') for _, video_url in positioned_videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1725
1726
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the distinct video ids linked from page, in page order."""
        unique_ids = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in unique_ids:
                continue
            unique_ids.append(video_id)
        return unique_ids

    def _real_extract(self, url):
        """Return a one-element list holding the channel's playlist_result.

        The first page is the regular HTML listing; while a page contains
        the load-more marker, further pages are requested from the JSON
        channel_ajax endpoint.  Returns None after reporting an error for an
        invalid URL.
        """
        # Extract channel id
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return
        channel_id = match.group(1)

        # Download channel page
        pagenum = 1
        listing_url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(listing_url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        video_ids = list(self.extract_videos_from_page(page))

        # Download any subsequent channel pages using the json-based channel_ajax query
        more_pages = self._MORE_PAGES_INDICATOR in page
        while more_pages:
            pagenum += 1

            ajax_url = self._MORE_PAGES_URL % (pagenum, channel_id)
            raw_page = self._download_webpage(ajax_url, channel_id,
                                              u'Downloading page #%s' % pagenum)
            ajax_data = json.loads(raw_page)

            video_ids.extend(self.extract_videos_from_page(ajax_data['content_html']))

            more_pages = self._MORE_PAGES_INDICATOR in ajax_data['load_more_widget_html']

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1785
1786
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Return a one-element list holding the user's uploads as a playlist.

        Returns None after reporting an error for an invalid URL.
        """
        # Extract username
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return
        username = match.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        page_size = self._GDATA_PAGE_SIZE
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * page_size + 1
            note = u'Downloading video ids from %d to %d' % (start_index, start_index + page_size)
            gdata_url = self._GDATA_URL % (username, page_size, start_index)
            page = self._download_webpage(gdata_url, username, note)

            # Extract video identifiers, dropping repeats within the page
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id in ids_in_page:
                    continue
                ids_in_page.append(video_id)
            video_ids += ids_in_page

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < page_size:
                break

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title=username)]
1844
1845
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a one-element list holding the user's videos as a playlist.

        The numeric user id is read from the user page and used to walk the
        paginated episode list.  Returns None after reporting an error when
        the URL is invalid or the user id cannot be found on the page.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # Without the id the episode list URL cannot be built
            self._downloader.report_error(u'unable to extract user id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Unescape before the duplicate check so that the value
                # tested is the same value that gets stored
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        url_entries = [self.url_result(u'http://blip.tv/%s' % video_id, 'BlipTV')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, playlist_title = username)]
1905
1906
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for the file.

        Returns None after reporting an error when the page cannot be
        retrieved or the download URL or title cannot be found in it.
        """
        # The file id is whatever follows the last slash
        file_id = url.rsplit('/', 1)[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        form_data = compat_urllib_parse.urlencode({'gateway_result': '1'})
        request = compat_urllib_request.Request(url, form_data)
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        url_match = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if url_match is None or url_match.group(1) is None:
            # Try to figure out reason of the error.
            notice = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if notice is None or notice.group(1) is None:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            else:
                # Collapse runs of whitespace so the notice fits on one line
                restriction_message = re.sub('\s+', ' ', notice.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            return

        file_url = url_match.group(1)
        # Extension without the leading dot
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        title_match = re.search(r'<b title="(.*?)">', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = title_match.group(1).decode('utf-8')

        info = {
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }
        return [info]
1957
1958
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in when credentials are available.

        Credentials come from the downloader's 'username'/'password'
        parameters or, with 'usenetrc', from the .netrc entry for
        _NETRC_MACHINE.  Problems are reported as warnings only.
        """
        if self._downloader is None:
            return

        params = self._downloader.params
        useremail = password = None

        # Attempt to use provided username and password or .netrc data
        if params.get('username', None) is not None:
            useremail = params['username']
            password = params['password']
        elif params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is None:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                # authenticators() yields (login, account, password)
                useremail, password = info[0], info[2]
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials: continue without logging in
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        post_data = compat_urllib_parse.urlencode(login_form)
        request = compat_urllib_request.Request(self._LOGIN_URL, post_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # Getting the login form back is treated as a failed login
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for the video.

        Returns None after reporting an error for an invalid URL; raises
        ExtractorError when the player data, the video URL or the title
        cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Always fetch the canonical video page for this id
        webpage = self._download_webpage(
            'https://www.facebook.com/video/video.php?v=%s' % video_id, video_id)

        # The player variables are the JSON text found between these two
        # script fragments
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        swf_vars = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not swf_vars:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(swf_vars.group(1)))
        # 'params' is itself URL-quoted JSON
        params = json.loads(compat_urllib_parse.unquote(data['params']))
        video_data = params['video_data'][0]

        # Use the HD source when there is one, the SD source otherwise
        video_url = video_data.get('hd_src') or video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        title_match = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(title_match.group(1))

        return [{
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }]
2056
2057
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report that the URL turned out to be a direct media download."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract the info dictionary for a blip.tv URL.

        ``/play/`` URLs are resolved through their HTTP redirect and then
        processed again as a regular URL.  For any other URL the JSON
        description of the post is requested; if the server answers with
        a ``video/*`` content type instead, the URL is treated as a direct
        download and the open handle is passed on in ``urlhandle``.

        Returns a one-element list with the info dictionary, or None after
        reporting an error.  Raises ExtractorError when the info page
        cannot be downloaded.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # The player URL redirects to an address whose fragment carries
            # the file reference; rebuild a regular URL from its last part.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                # Only byte strings need decoding.  Text strings (always the
                # case on Python 3) have no decode() method, and decoding a
                # Python 2 unicode object fails on non-ASCII characters.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either the post itself or wrapped in 'Post'.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError, KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2154
2155
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        """Extract the FLV URL and title of a myvideo.de video.

        Returns a one-element list with the info dictionary, or None
        after reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # The attribute is '_downloader'; the former '_download' typo
            # raised AttributeError instead of reporting the bad URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The media file lives next to the thumbnail: take the thumbnail's
        # base path from the image_src link and append '<id>.flv'.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2197
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Handles shortcut names (e.g. ``:tds``), full-episode URLs and clip URLs
    on thedailyshow.com and colbertnation.com.  One episode may yield
    several info dictionaries, one per item of the show's MRSS index.
    """

    # URLs can be abbreviations like :thedailyshow or :colbert,
    # full-episode URLs of the form http://www.thedailyshow.com/full-episodes/<episode>
    # (an empty <episode> part selects the newest episode),
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE: this is a verbose pattern and must always be used with re.VERBOSE.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known rendition bitrates; used as keys of the two tables below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Frame size per bitrate.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE.

        Matches with re.VERBOSE because _VALID_URL is written as a
        multi-line verbose pattern.
        """
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print one line per format: bitrate, extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract every part of the episode or clip that ``url`` refers to.

        Returns a list of info dictionaries (one per item of the show
        index), or None after reporting an invalid URL or after printing
        the format list when 'listformats' is set.  Raises ExtractorError
        when the redirect target, the media reference or the RTMP URL
        cannot be interpreted.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Expand a shortcut into the show's full-episodes landing page and
        # re-match so that the named groups below are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            # The clip title lives in a different group for each show.
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode part means "download the newest episode".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The landing page is expected to redirect to a specific
            # episode; take the episode name from the final URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # Each hit is a (full player URL, media reference) tuple.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the media reference in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Only the first media reference found is used.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The guid is a colon-separated id: its last field is the short
            # media id, the one before it the show's domain.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            # Collect (bitrate, source URL) pairs in document order.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                # Listing only: print the formats of the first part that
                # has renditions and stop without returning any results.
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            # NOTE(review): this takes the last rendition, which assumes the
            # feed lists renditions in ascending bitrate order - confirm.
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Map the RTMP path onto the fixed HTTP mirror prefix below so
            # that the file can be fetched over plain HTTP.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2365
2366
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract the video URL and metadata of an Escapist episode.

        Returns a one-element list with the info dictionary, or None
        after reporting an invalid URL or an unparsable configuration.
        Raises ExtractorError when an expected meta tag, the player
        configuration URL or the playlist entry is missing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Every lookup is checked so that a changed page layout produces a
        # readable extractor error instead of an AttributeError on None.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            raise ExtractorError(u'unable to extract description')
        description = unescapeHTML(descMatch.group(1))

        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            raise ExtractorError(u'unable to extract thumbnail')
        imgUrl = unescapeHTML(imgMatch.group(1))

        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            raise ExtractorError(u'unable to extract player URL')
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        # The player URL carries the URL-quoted configuration address in
        # its trailing 'config=' parameter.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            raise ExtractorError(u'unable to extract configuration URL')
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The video is taken from the second playlist entry.
        # NOTE(review): presumably the first entry is an intro/ad - confirm.
        try:
            videoUrl = config['playlist'][1]['url']
        except (KeyError, IndexError, TypeError):
            raise ExtractorError(u'unable to find video URL in configuration')

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2422
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract the first media fragment URL of a CollegeHumor video.

        Downloads the metadata XML, then the f4m manifest it points to,
        and builds the URL of the first segment/fragment from both.

        Returns a one-element list with the info dictionary, or None
        after reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            # From here on video_id is the manifest's own id, which is used
            # to build the fragment path.
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # IndexError: a required element is missing.
            # KeyError: the media element has no 'url' attribute.
            self._downloader.report_error(u'Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2489
2490
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Scrape the FLV URL, title and thumbnail from a video page.

        Returns a one-element list with the info dictionary, or None
        after reporting the first item that could not be extracted.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Media location: URL-quoted value of the 'flv_url' parameter.
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        flv_url = compat_urllib_parse.unquote(flv_match.group(1))

        # Title: the <title> text up to the site-name suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        title = title_match.group(1)

        # Thumbnail: the complete matched URL is used, not the captured
        # file name.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': flv_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumbnail,
            'description': None,
        }]
2544
2545
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       The track page URL is first resolved to its metadata through
       api.soundcloud.com/resolve.json; the stream definitions are then
       fetched from api.sndcdn.com using the numeric track id, and the
       128 kbit/s HTTP MP3 stream is used as the media URL.
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Announce that the id of the given track is being resolved."""
        message = u'%s: Resolving id' % video_id
        self.to_screen(message)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of a track.

        Returns None after reporting an error when ``url`` does not match
        ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the uploader and the track slug come straight from the URL.
        uploader, slug_title = url_match.group(1), url_match.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        client_id = 'b45b1aa10f1ac2941910a7f0d10f8e28'
        track_page = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = ('http://api.soundcloud.com/resolve.json?url=' + track_page
                      + '&client_id=' + client_id)
        info = json.loads(
            self._download_webpage(resolv_url, full_title, u'Downloading info JSON'))
        track_id = info['id']
        self.report_extraction(full_title)

        streams_url = ('https://api.sndcdn.com/i1/tracks/' + str(track_id)
                       + '/streams?client_id=' + client_id)
        streams = json.loads(
            self._download_webpage(streams_url, full_title,
                                   u'Downloading stream definitions',
                                   u'unable to download stream definitions'))
        media_url = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id':       info['id'],
            'url':      media_url,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2603
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets

       The set page URL is resolved to its metadata through
       api.soundcloud.com/resolve.json; for every track of the set the
       stream definitions are then fetched from api.sndcdn.com and the
       128 kbit/s HTTP MP3 stream is used as the media URL.
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Announce that the id of the given set is being resolved."""
        message = u'%s: Resolving id' % video_id
        self.to_screen(message)

    def _real_extract(self, url):
        """Return one info dictionary per track of the set.

        Returns None after reporting an error when ``url`` does not match
        ``_VALID_URL`` or when the resolve call answers with errors.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the uploader and the set slug come straight from the URL.
        uploader, slug_title = url_match.group(1), url_match.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        client_id = 'b45b1aa10f1ac2941910a7f0d10f8e28'
        set_page = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = ('http://api.soundcloud.com/resolve.json?url=' + set_page
                      + '&client_id=' + client_id)
        info = json.loads(self._download_webpage(resolv_url, full_title))

        if 'errors' in info:
            # Report every error the API returned, then give up.
            for api_error in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(api_error['error_message']))
            return

        self.report_extraction(full_title)
        entries = []
        for track in info['tracks']:
            track_id = track['id']

            streams_url = ('https://api.sndcdn.com/i1/tracks/' + str(track_id)
                           + '/streams?client_id=' + client_id)
            stream_json = self._download_webpage(streams_url, track_id, u'Downloading track info JSON')

            self.report_extraction(track_id)
            media_url = json.loads(stream_json)['http_mp3_128_url']

            entry = {
                'id':       track_id,
                'url':      media_url,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            }
            entries.append(entry)
        return entries
2667
2668
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMPE stream URL and metadata of an InfoQ page.

        Returns a one-element list with the info dictionary, or None
        after reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL
        # The stream path is stored base64-encoded and URL-quoted in the
        # page's 'jsclassref' variable.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Split on the LAST dot only: a plain split('.') unpacked into two
        # names raised ValueError for file names containing several dots.
        video_filename = video_url.split('/')[-1]
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            self._downloader.report_error(u'unable to determine video file extension')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2718
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        Returns the URL list stored under ``fmt``.  When the entry maps
        bitrates to URL lists, the requested bitrate is used, or the
        highest one if it is None, 'best' or not available.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none can be opened"""
        for url in url_list:
            try:
                # Only reachability matters; close the probe right away so
                # the connection is not leaked.
                compat_urllib_request.urlopen(url).close()
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue

        return None

    def _print_formats(self, formats):
        """Print every format with its bitrates and file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract the media URL and metadata of a Mixcloud cloudcast.

        Returns a one-element list with the info dictionary, or None
        after reporting an error or after printing the format list when
        'listformats' is set.
        """
        def to_text(value):
            # Decode byte strings only; text strings (always the case on
            # Python 3) have no decode() method and are returned as-is.
            if isinstance(value, bytes):
                return value.decode('utf-8')
            return value

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = to_text(mobj.group(1))
        file_id = uploader + u'-' + to_text(mobj.group(2))

        # construct API request
        # Built from the matched groups: slicing the split URL only gave
        # the right path when the URL ended with a slash.
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/' + mobj.group(1) + '/' + mobj.group(2) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        try:
            self.report_download_json(api_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        # The response body is bytes; decode before parsing.
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # Kept separate from api_url so that a failed search can never
        # leave the API address behind as the "media" URL.
        media_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            # Take the first format that has a reachable URL.
            for candidate in formats.keys():
                media_url = self.check_urls(self.get_urls(formats, candidate))
                if media_url is not None:
                    format_param = candidate
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            media_url = self.check_urls(self.get_urls(formats, req_format))
            format_param = req_format

        if media_url is None:
            self._downloader.report_error(u'unable to find a working media URL')
            return

        return [{
            'id': file_id,
            'url': to_text(media_url),
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': to_text(media_url.split('.')[-1]),
            'format': to_text(format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': to_text(player_url),
        }]
2826
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Three kinds of URLs are handled: a specific video page (one info
    dictionary is returned), a course page (every video page linked from it
    is extracted) and the root page (every course page linked from it is
    extracted).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'
    _MAIN_FOLDER = 'http://openclassroom.stanford.edu/MainFolder/'

    def _extract_linked_pages(self, page, link_re):
        """Run self.extract on every distinct page linked from `page`.

        link_re must capture, in its first group, the link target relative
        to the main folder.  The results of all linked pages are returned as
        one list, in the order the links appear.
        """
        results = []
        for href in orderedSet(re.findall(link_re, page)):
            results += self.extract(self._MAIN_FOLDER + unescapeHTML(href))
        return results

    def _real_extract(self, url):
        """Return the list of info dictionaries for a video, course or root URL.

        Raises ExtractorError for a URL that does not match _VALID_URL.
        Returns None after reporting an error about the video metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = self._MAIN_FOLDER + 'courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                # The raw bytes are fine here: ElementTree parses them itself.
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            coursepage = self._download_webpage(url, course,
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')
            return self._extract_linked_pages(coursepage, r'<a href="(VideoPage.php\?[^"]+)">')
        else: # Root page
            rootURL = self._MAIN_FOLDER + 'HomePage.php'
            # Fetched through _download_webpage, like the course page, so
            # that the page is text and can be searched with a text regex
            # (urlopen().read() yields bytes on Python 3).
            rootpage = self._download_webpage(rootURL, 'Stanford OpenClassroom',
                                        errnote='Unable to download course info page')
            return self._extract_linked_pages(rootpage, r'<a href="(CoursePage.php\?[^"]+)">')
2930
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _meta_content(self, name, webpage):
        """Return the content of the <meta name="..."> tag in webpage, or None.

        Byte strings are decoded as iso-8859-1; text is returned unchanged
        (text has no decode method on Python 3).
        """
        mobj = re.search(r'<meta name="%s" content="([^"]+)"/>' % name, webpage)
        if mobj is None:
            return None
        value = mobj.group(1)
        if isinstance(value, bytes):
            value = value.decode('iso-8859-1')
        return value

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Returns None after reporting an error when the URL is invalid, a
        required value is missing from the page, or the metadata cannot be
        downloaded or lacks a usable rendition.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._meta_content('mtv_vt', webpage)
        if song_name is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(song_name)

        performer = self._meta_content('mtv_an', webpage)
        if performer is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(performer)
        video_title = performer + ' - ' + song_name

        mtvn_uri = self._meta_content('mtvn_uri', webpage)
        if mtvn_uri is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            self._downloader.report_error(u'unable to find any rendition')
            return

        # For now, always pick the highest quality.
        # NOTE(review): this assumes the last rendition is the best one.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            # KeyError: an attribute is missing; AttributeError: find()
            # returned None because the rendition has no <src> child.
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3006
3007
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com; yields one entry per segment."""
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Return a session id: current time in ms followed by two random numbers."""
        millis = int(time.time() * 1000)
        first = random.randint(1000,1998)
        second = random.randint(1000,9999)

        return "%d%d%d" % (millis, first, second)

    def _get_file_ID_mix_string(self, seed):
        """Return the alphabet as a list of characters, shuffled according to seed.

        A linear congruential sequence started from seed selects, one at a
        time, which of the remaining characters comes next.
        """
        remaining = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        state = float(seed)
        shuffled = []
        while remaining:
            state = (state * 211 + 30031) % 65536
            position = int(math.floor(state / 65536 * len(remaining)))
            # All characters are distinct, so popping by position removes
            # exactly the character that was picked.
            shuffled.append(remaining.pop(position))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of indexes into the shuffled alphabet."""
        alphabet = self._get_file_ID_mix_string(seed)
        return ''.join(alphabet[int(token)] for token in fileId.split('*') if token)

    def _real_extract(self, url):
        """Return one info dictionary per segment of the video.

        Returns None after reporting an error when the URL is invalid or the
        playlist data lacks the expected entries.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = url_match.group('ID')

        jsondata = self._download_webpage(
            'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id, video_id)

        self.report_extraction(video_id)
        try:
            entry = json.loads(jsondata)['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            requested = self._downloader.params.get('format', None)
            available = list(entry['streamfileids'].keys())

            if requested == 'worst':
                stream, ext = 'mp4', u'mp4'
            else:
                # Everything else is served as flv; only 'best' (or no
                # request at all) upgrades to hd2 when it is offered.
                ext = u'flv'
                wants_best = requested is None or requested == 'best'
                stream = 'hd2' if (wants_best and 'hd2' in available) else 'flv'

            fileid = entry['streamfileids'][stream]
            keys = [segment['k'] for segment in entry['segs'][stream]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # The characters at indexes 8 and 9 of the file id are replaced by
        # the segment number, written as two uppercase hex digits.
        return [
            {
                'id': '%s_part%02d' % (video_id, index),
                'url': 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (
                    sid, index, '%s%02X%s' % (fileid[0:8], index, fileid[10:]), key),
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            for index, key in enumerate(keys)]
3102
3103
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        The video URL, title and thumbnail are all required: when one of
        them is not found in the page an error is reported and None is
        returned.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Searched in this order; the first missing value aborts.
        wanted = (
            (self.VIDEO_URL_RE, u'unable to extract video url'),
            (self.VIDEO_TITLE_RE, u'unable to extract video title'),
            (self.VIDEO_THUMB_RE, u'unable to extract video thumbnail'),
        )
        values = []
        for pattern, failure in wanted:
            found = re.search(pattern, webpage)
            if found is None:
                self._downloader.report_error(failure)
                return
            values.append(found.group(1))
        quoted_url, video_title, video_thumbnail = values

        return [{
            'id': video_id,
            'url': compat_urllib_parse.unquote(quoted_url),
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3151
3152
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report that the entry is being downloaded."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the upload date of the entry."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the uploader of the entry."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the title of the entry."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the post's video.

        Returns None after reporting an error when the URL is invalid or
        when the video page or the video links cannot be found.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        if not post_url.startswith('https://'):
            # The scheme is optional in _VALID_URL but needed for the request.
            post_url = 'https://' + post_url
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        upload_date = None
        mobj = re.search(r'title="Timestamp">(.*?)</a>', webpage)
        if mobj:
            try:
                # Convert timestring to a format suitable for filename
                upload_date = datetime.datetime.strptime(mobj.group(1), "%Y-%m-%d").strftime('%Y%m%d')
            except ValueError:
                # The date is optional; an unexpected format must not abort.
                upload_date = None
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        mobj = re.search(r'rel\="author".*?>(.*?)</a>', webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        mobj = re.search(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Simulate clicking the image box to launch video
        mobj = re.search(r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            return

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes on video page; every item is a
        # (resolution, url) pair of strings.
        links = re.findall(r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"', webpage)
        if len(links) == 0:
            self._downloader.report_error(u'unable to extract video links')
            return

        # Sort by resolution.  The comparison must be numeric: as strings
        # '1080' would sort before '720'.
        links = sorted(links, key=lambda link: int(link[0]))

        # The last item has the highest resolution; only its url is needed.
        video_url = links[-1][-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3263
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        The media URL is built from the path of the page; title, date and
        description are looked up in the page and are optional.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        index_suffix = '/index.html'
        video_id = url_match.group(1)
        if video_id.endswith(index_suffix):
            video_id = video_id[:-len(index_suffix)]

        webpage = self._download_webpage(url, video_id)

        def _findProp(rexp, default=None):
            # First group of rexp in the page, HTML-unescaped, or default.
            found = re.search(rexp, webpage)
            return unescapeHTML(found.group(1)) if found else default

        # The last path component serves as id and as fallback title.
        shortened_video_id = video_id.rpartition('/')[2]
        page_title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id)
        return [{
            'id': shortened_video_id,
            'url': u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4',
            'ext': 'mp4',
            'title': page_title.replace('NBA.com: ', ''),
            # NOTE(review): this key is 'uploader_date', not 'upload_date',
            # and the value is the date text as shown on the page - confirm
            # whether any consumer reads it.
            'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }]
3299
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url, video_id):
        """Download one page of the API and convert its clips.

        Returns a tuple (count, info): count is the number of items in the
        response, info the list of info dictionaries of the items that have
        a video file URL.  When the API answers with something other than a
        list, the error is reported and (0, []) is returned, so that the
        caller can always unpack the result and stops paging.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if not isinstance(response, list):
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if not video_url:
                # Items without a file are counted but not returned.
                continue
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            clip_id = clip['id']
            info.append({
                'id': clip_id,
                'url': video_url,
                'title': clip.get('title', clip_id),
                'uploader': clip.get('channel_name', video_uploader_id),
                'uploader_id': video_uploader_id,
                # The first ten characters of start_time, dashes removed.
                'upload_date': clip['start_time'][:10].replace('-', ''),
                'ext': os.path.splitext(video_url)[1][1:],
            })
        return (len(response), info)

    def _real_extract(self, url):
        """Return the info dictionaries of a broadcast or of a whole channel.

        A channel URL (no /b/ part) is fetched page by page until a page
        comes back with fewer items than the page limit; a broadcast URL
        takes a single request.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3378
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Returns None after reporting an error when the URL is invalid or the
        video source or title cannot be found; the description is optional.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The URL is taken from the second <source> of the <video> element.
        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the title of the page.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                self._downloader.report_error(u'Cannot find video title')
                return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3417
class SteamIE(InfoExtractor):
    """Information extractor for the videos of a store.steampowered.com game."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax and needs the flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list holding the playlist of the game's videos.

        The age-check page of the game is requested with a fixed birth date
        and every movie found on it becomes one playlist entry.

        Raises ExtractorError when the URL is invalid or the game title is
        not found in the page.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        title_match = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find game title for %s' % gameID)
        game_title = title_match.group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # Movies, titles and thumbnails are paired by their order in the
        # page; zip stops at the shortest of the three sequences.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            # No check for an empty URL is needed: the videoURL group
            # requires at least one character whenever urlRE matches.
            videos.append({
                'id': vid.group('videoID'),
                'url': vid.group('videoURL'),
                'ext': 'flv',
                'title': unescapeHTML(vtitle.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3462
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the recording.

        The media URL is derived from the video id alone; the page is only
        needed for the title and the uploader.

        Raises ExtractorError when the URL is invalid or the title is not
        found.  The uploader is optional and None when it is not found.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        if m is None:
            raise ExtractorError(u'Cannot find video title for %s' % video_id)
        title = m.group('title')

        # The uploader is reported as the numeric id of the channel.
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader') if m is not None else None

        info = {
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': title,
                'uploader': uploader
                  }
        return [info]
3484
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Raises ExtractorError when the video URL or the page title cannot be
        found.  The thumbnail is optional.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)

        source_match = re.search(r'so\.addVariable\("file","(.*?)"\)', page)
        if source_match is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = source_match.group(1)
        # Anything that does not mention mp4 is taken to be flv.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", page)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        thumbnail = None
        thumbnail_match = re.search(r'rel="image_src" href="(.*)" />', page)
        if thumbnail_match is not None:
            thumbnail = thumbnail_match.group(1)
        else:
            # Without a thumbnail this may be a WSHH candy video, whose
            # correct title is found in the candytitles element.
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
3533
class RBMARadioIE(InfoExtractor):
    """Information extractor for the shows on rbmaradio.com."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the show.

        The metadata is the JSON object assigned to gon.show in a script of
        the page.  Raises ExtractorError when that script is not found or
        its JSON is invalid.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        found = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not found:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(found.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # The extension comes from the path only, not from the query string.
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        title = data['title']
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
                'id': video_id,
                'url': video_url,
                'ext': video_ext,
                'title': title,
                'description': data.get('teaser_text'),
                'location': data.get('country_of_origin'),
                'uploader': host.get('name'),
                'uploader_id': host.get('slug'),
                'thumbnail': image.get('large_url_2x'),
                'duration': data.get('duration'),
        }]
3568
3569
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print the extension and format name of every entry in formats."""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, else None."""
        for candidate in formats:
            if candidate['format'] == req_format:
                return candidate
        return None

    def _real_extract(self, url):
        """Extract the downloadable formats of the video at url.

        Returns a list of info dictionaries chosen according to the
        downloader's 'format' parameter ('best'/None, 'worst', '-1'/'all',
        or an explicit format name).  Returns None after reporting an error
        for an invalid URL or an unavailable format, and after printing the
        format table when 'listformats' is set.  Raises ExtractorError when
        the title, the download list or its links cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (optional)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = clean_html(result.group('uploader').strip())

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The fifth path component starts with "<size>_<bitrate>_".
            format_parts = path.split('/')[4].split('_')[:2]
            size = format_parts[0]
            bitrate = format_parts[1]
            format_name = "-".join(format_parts)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format_name,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # The first link is treated as the best format, the last as the worst.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            selected = self._specific(req_format, formats)
            # Check the lookup result itself; testing the stale regex match
            # here would let an unknown format through as [None].
            if selected is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [selected]
3686
3687
3688
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The id and title are taken from the URL itself; the media URL and
        the upload date are scraped from the page.  Each failure is reported
        through the downloader and makes the method return None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # This lookup is for the date, so say so in the error.
            self._downloader.report_error(u'unable to extract upload date')
            return
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3730
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The watch page supplies the title and a link to the embed page; the
        embed page supplies the media URL and the numeric id that ends up in
        the result.  Returns None (after reporting) for an invalid URL and
        raises ExtractorError when any of the page lookups fails.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id taken from the URL only labels the first download.
        watch_page = self._download_webpage(url, url_match.group('videoid'))

        title_match = re.search(r'<title>(?P<title>.*)</title>', watch_page)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', watch_page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_page_url, video_id)

        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', embed_page)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3776
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return one info dictionary per track of the mix at url.

        The mix description is read from the ``PAGE.mix`` JSON of the page;
        the tracks are then requested one at a time from the play/next
        endpoints until the response flags the last track.  Raises
        ExtractorError for an invalid URL or a page without mix data.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        page = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", page, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        mix = json.loads(mix_match.group(1))

        # A random session number is sent with every API request.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']
        api_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        tracks = []
        position = 0
        while True:
            position += 1
            api_json = self._download_webpage(api_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(position), track_count),
                errnote=u'Failed to download song information')
            play_set = json.loads(api_json)[u'set']
            track = play_set['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if play_set['at_last_track']:
                break
            # The next request names the track that was just received.
            api_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
        return tracks
3820
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The media and thumbnail URLs are built from the video id alone; the
        page is downloaded only for the title and the uploader name.

        Raises ExtractorError when the og:title meta tag is missing.  The
        uploader is optional and is set to None when it cannot be found.
        """
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if m is None:
            # The title is mandatory, so fail with a clear extractor error
            # rather than an AttributeError on None.
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        if m is None:
            uploader = None
        else:
            uploader = clean_html(m.group('uploader'))

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }
        return [info]
3844
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag.
        url_match = re.match(cls._VALID_URL, url, re.VERBOSE)
        return url_match is not None

    def _real_extract(self, url):
        """Return a one-element list: a talk info dict or a playlist result."""
        url_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if not url_match.group('type_talk'):
            playlist_id = url_match.group('playlist_id')
            name = url_match.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]
        return [self._talk_info(url)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'

        page = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        item_matches = re.finditer(video_RE, page, re.VERBOSE)
        name_matches = re.finditer(video_name_RE, page)

        playlist_title = re.search(playlist_RE, page).group('playlist_title')

        # Items and names are walked in lockstep, so the number of entries
        # is bounded by the shorter of the two match sequences.
        entries = []
        for _item, name_match in zip(item_matches, name_matches):
            talk_url = 'http://www.ted.com%s' % name_match.group('talk_url')
            entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        talk_name = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        page = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % talk_name)

        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'

        title = re.search(title_RE, page).group('title')
        thumb_match = re.search(thumb_RE, page)
        details = re.search(info_RE, page, re.VERBOSE)
        talk_id = details.group('videoID')
        media_url = self._talk_video_link(details.group('mediaSlug'))
        return {
            'id': talk_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail'),
        }
3923
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The video id is the last non-empty path element of the URL; all
        other fields come from the metadata XML fetched for that id.
        Returns None (after reporting an error) when the metadata has no
        download URL or no title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text

        # The format defaults to the file extension when the metadata has
        # no explicit format_id element.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text

        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None

        image_preview_el = metadata.find('imagePreview')
        thumbnail = image_preview_el.text if image_preview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3979
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The title is scraped from the page; file name and duration are read
        from the last child of the per-video XML document.  Raises
        ExtractorError when the title cannot be found.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', page)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        # Only the last entry of the document is used.
        variants = xml.etree.ElementTree.fromstring(xml_code)
        chosen = variants[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4012
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # "https?" so that both http:// and https:// URLs are accepted.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Returns None (after reporting an error through the downloader) when
        the URL is invalid or when the media URL or the title cannot be
        found.  Description and uploader are optional and default to None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            self._downloader.report_error(u'Cannot find video title')
            # Stop here: there is no match object to read the title from.
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4061
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / Das Erste Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Among the streams with media type 0 the one with the highest quality
        number is used.  Returns None (after reporting an error) when the
        page lists no streams at all.
        """
        url_match = re.match(self._VALID_URL, url)

        # A numeric documentId in the URL takes precedence over the path id.
        document_id = re.search(r'documentId=([0-9]+)', url)
        video_id = document_id.group(1) if document_id else url_match.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [found.groupdict() for found in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        default_type_streams = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(default_type_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if not stream['rtmp_url']:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        else:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        return [info]
4101
class TumblrIE(InfoExtractor):
    """Information extractor for videos in tumblr.com posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The post page is always requested in its ``/post/<id>/`` form.
        Returns an empty list when the page contains no video source.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        post_url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        page = self._download_webpage(post_url, video_id)

        # The page carries the player markup with quotes written as \x22.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video_match = re.search(re_video, page)
        if video_match is None:
            self.to_screen("No video founded")
            return []
        video_url = video_match.group('video_url')
        ext = video_match.group('ext')

        # We pick the first poster
        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'
        thumb_match = re.search(re_thumb, page)
        thumbnail = thumb_match.group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        title_match = re.search(re_title, page, re.DOTALL)
        title = unescapeHTML(title_match.group('title'))

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
4135
class BandcampIE(InfoExtractor):
    """Information extractor for free tracks on bandcamp.com."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Only tracks offering a free download page are supported; for other
        tracks an error is reported and None is returned.
        """
        title = re.match(self._VALID_URL, url).group('title')
        track_page = self._download_webpage(url, title)

        # We get the link to the free download page
        free_page_match = re.search(r'freeDownloadPage: "(.*?)"', track_page)
        if free_page_match is None:
            self._downloader.report_error('No free songs founded')
            return
        download_link = free_page_match.group(1)

        id_match = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             track_page, re.MULTILINE|re.DOTALL)
        track_id = id_match.group('id')

        download_page = self._download_webpage(download_link, track_id,
                                               'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        items_json = re.search(r'items: (.*?),$',
                               download_page, re.MULTILINE).group(1)
        track = json.loads(items_json)[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        # If we try to use this url it says the link has expired
        initial_url = track[u'downloads'][u'mp3-320'][u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        url_parts = re.match(re_url, initial_url)

        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (url_parts.group('server'), url_parts.group('fsig'), track_id, url_parts.group('ts'))
        stat_page = self._download_webpage(request_url, track_id, 'Requesting download url')

        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', stat_page).group(1)

        return [{
            'id': track_id,
            'title': track[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': track[u'thumb_url'],
            'uploader': track[u'artist'],
        }]
4181
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        """Return a one-element list with the info dictionary for url.

        Raises ExtractorError for an invalid URL and when the mp4 source or
        the title heading cannot be found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        source_match = re.search(r'<source src="(.+)" type="video/mp4">', page)
        if source_match is None:
            raise ExtractorError(u'Unable to extract media URL')

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', page)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      source_match.group(1),
            'ext':      'mp4',
            'title':    title_match.group(1),
        }]
4212
4213
def gen_extractors():
    """Build one instance of every supported extractor.

    The list is ordered: the first extractor that matches a URL is the one
    that handles it, so the catch-all GenericIE comes last.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in extractor_classes]
4272
def get_info_extractor(ie_name):
    """Look up an extractor class by its name without the 'IE' suffix.

    The class is fetched from this module's globals; a KeyError is raised
    when no global named ``ie_name + 'IE'`` exists.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]