youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19
  20 from .utils import *
  21
  22
  23 class InfoExtractor(object):
  24     """Information Extractor class.
  25
  26     Information extractors are the classes that, given a URL, extract
  27     information about the video (or videos) the URL refers to. This
  28     information includes the real video URL, the video title, author and
  29     others. The information is stored in a dictionary which is then
  30     passed to the FileDownloader. The FileDownloader processes this
  31     information possibly downloading the video to the file system, among
  32     other possible outcomes.
  33
  34     The dictionaries must include the following fields:
  35
  36     id:             Video identifier.
  37     url:            Final video URL.
  38     title:          Video title, unescaped.
  39     ext:            Video filename extension.
  40
  41     The following fields are optional:
  42
  43     format:         The video format, defaults to ext (used for --get-format)
  44     thumbnail:      Full URL to a video thumbnail image.
  45     description:    One-line video description.
  46     uploader:       Full name of the video uploader.
  47     upload_date:    Video upload date (YYYYMMDD).
  48     uploader_id:    Nickname or id of the video uploader.
  49     location:       Physical location of the video.
  50     player_url:     SWF Player URL (used for rtmpdump).
  51     subtitles:      The subtitle file contents.
  52     urlhandle:      [internal] The urlHandle to be used to download the file,
  53                     like returned by urllib.request.urlopen
  54
  55     The fields should all be Unicode strings.
  56
  57     Subclasses of this one should re-define the _real_initialize() and
  58     _real_extract() methods and define a _VALID_URL regexp.
  59     Probably, they should also be added to the list of extractors.
  60
  61     _real_extract() must return a *list* of information dictionaries as
  62     described above.
  63
  64     Finally, the _WORKING attribute should be set to False for broken IEs
  65     in order to warn the users and skip the tests.
  66     """
  67
  68     _ready = False
  69     _downloader = None
  70     _WORKING = True
  71
  72     def __init__(self, downloader=None):
  73         """Constructor. Receives an optional downloader."""
  74         self._ready = False
  75         self.set_downloader(downloader)
  76
  77     @classmethod
  78     def suitable(cls, url):
  79         """Receives a URL and returns True if suitable for this IE."""
  80         return re.match(cls._VALID_URL, url) is not None
  81
  82     @classmethod
  83     def working(cls):
  84         """Getter method for _WORKING."""
  85         return cls._WORKING
  86
  87     def initialize(self):
  88         """Initializes an instance (authentication, etc)."""
  89         if not self._ready:
  90             self._real_initialize()
  91             self._ready = True
  92
  93     def extract(self, url):
  94         """Extracts URL information and returns it in list of dicts."""
  95         self.initialize()
  96         return self._real_extract(url)
  97
  98     def set_downloader(self, downloader):
  99         """Sets the downloader for this IE."""
 100         self._downloader = downloader
 101
 102     def _real_initialize(self):
 103         """Real initialization process. Redefine in subclasses."""
 104         pass
 105
 106     def _real_extract(self, url):
 107         """Real extraction process. Redefine in subclasses."""
 108         pass
 109
 110     @property
 111     def IE_NAME(self):
 112         return type(self).__name__[:-2]
 113
 114     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 115         """ Returns the response handle """
 116         if note is None:
 117             self.report_download_webpage(video_id)
 118         elif note is not False:
 119             self.to_screen(u'%s: %s' % (video_id, note))
 120         try:
 121             return compat_urllib_request.urlopen(url_or_request)
 122         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 123             if errnote is None:
 124                 errnote = u'Unable to download webpage'
 125             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 126
 127     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 128         """ Returns a tuple (page content as string, URL handle) """
 129         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 130         content_type = urlh.headers.get('Content-Type', '')
 131         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 132         if m:
 133             encoding = m.group(1)
 134         else:
 135             encoding = 'utf-8'
 136         webpage_bytes = urlh.read()
 137         if self._downloader.params.get('dump_intermediate_pages', False):
 138             try:
 139                 url = url_or_request.get_full_url()
 140             except AttributeError:
 141                 url = url_or_request
 142             self.to_screen(u'Dumping request to ' + url)
 143             dump = base64.b64encode(webpage_bytes).decode('ascii')
 144             self._downloader.to_screen(dump)
 145         content = webpage_bytes.decode(encoding, 'replace')
 146         return (content, urlh)
 147
 148     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 149         """ Returns the data of the page as a string """
 150         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 151
 152     def to_screen(self, msg):
 153         """Print msg to screen, prefixing it with '[ie_name]'"""
 154         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 155
 156     def report_extraction(self, id_or_name):
 157         """Report information extraction."""
 158         self.to_screen(u'%s: Extracting information' % id_or_name)
 159
 160     def report_download_webpage(self, video_id):
 161         """Report webpage download."""
 162         self.to_screen(u'%s: Downloading webpage' % video_id)
 163
 164     def report_age_confirmation(self):
 165         """Report attempt to confirm age."""
 166         self.to_screen(u'Confirming age')
 167
 168     #Methods for following #608
 169     #They set the correct value of the '_type' key
 170     def video_result(self, video_info):
 171         """Returns a video"""
 172         video_info['_type'] = 'video'
 173         return video_info
 174     def url_result(self, url, ie=None):
 175         """Returns a url that points to a page that should be processed"""
 176         #TODO: ie should be the class used for getting the info
 177         video_info = {'_type': 'url',
 178                       'url': url,
 179                       'ie_key': ie}
 180         return video_info
 181     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 182         """Returns a playlist"""
 183         video_info = {'_type': 'playlist',
 184                       'entries': entries}
 185         if playlist_id:
 186             video_info['id'] = playlist_id
 187         if playlist_title:
 188             video_info['title'] = playlist_title
 189         return video_info
 190
 191
 192 class YoutubeIE(InfoExtractor):
 193     """Information extractor for youtube.com."""
 194
 195     _VALID_URL = r"""^
 196                      (
 197                          (?:https?://)?                                       # http(s):// (optional)
 198                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 199                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 200                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 201                          (?:                                                  # the various things that can precede the ID:
 202                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 203                              |(?:                                             # or the v= param in all its forms
 204                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 205                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 206                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 207                                  v=
 208                              )
 209                          )?                                                   # optional -> youtube.com/xxxx is OK
 210                      )?                                                       # all until now is optional -> you can pass the naked ID
 211                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 212                      (?(1).+)?                                                # if we found the ID, everything can follow
 213                      $"""
 214     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 215     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 216     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 217     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 218     _NETRC_MACHINE = 'youtube'
 219     # Listed in order of quality
 220     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 221     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 222     _video_extensions = {
 223         '13': '3gp',
 224         '17': 'mp4',
 225         '18': 'mp4',
 226         '22': 'mp4',
 227         '37': 'mp4',
 228         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 229         '43': 'webm',
 230         '44': 'webm',
 231         '45': 'webm',
 232         '46': 'webm',
 233     }
 234     _video_dimensions = {
 235         '5': '240x400',
 236         '6': '???',
 237         '13': '???',
 238         '17': '144x176',
 239         '18': '360x640',
 240         '22': '720x1280',
 241         '34': '360x640',
 242         '35': '480x854',
 243         '37': '1080x1920',
 244         '38': '3072x4096',
 245         '43': '360x640',
 246         '44': '480x854',
 247         '45': '720x1280',
 248         '46': '1080x1920',
 249     }
 250     IE_NAME = u'youtube'
 251
 252     @classmethod
 253     def suitable(cls, url):
 254         """Receives a URL and returns True if suitable for this IE."""
 255         if YoutubePlaylistIE.suitable(url): return False
 256         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 257
 258     def report_lang(self):
 259         """Report attempt to set language."""
 260         self.to_screen(u'Setting language')
 261
 262     def report_login(self):
 263         """Report attempt to log in."""
 264         self.to_screen(u'Logging in')
 265
 266     def report_video_webpage_download(self, video_id):
 267         """Report attempt to download video webpage."""
 268         self.to_screen(u'%s: Downloading video webpage' % video_id)
 269
 270     def report_video_info_webpage_download(self, video_id):
 271         """Report attempt to download video info webpage."""
 272         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 273
 274     def report_video_subtitles_download(self, video_id):
 275         """Report attempt to download video info webpage."""
 276         self.to_screen(u'%s: Checking available subtitles' % video_id)
 277
 278     def report_video_subtitles_request(self, video_id, sub_lang, format):
 279         """Report attempt to download video info webpage."""
 280         self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 281
 282     def report_video_subtitles_available(self, video_id, sub_lang_list):
 283         """Report available subtitles."""
 284         sub_lang = ",".join(list(sub_lang_list.keys()))
 285         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
 286
 287     def report_information_extraction(self, video_id):
 288         """Report attempt to extract video information."""
 289         self.to_screen(u'%s: Extracting video information' % video_id)
 290
 291     def report_unavailable_format(self, video_id, format):
 292         """Report extracted video URL."""
 293         self.to_screen(u'%s: Format %s not available' % (video_id, format))
 294
 295     def report_rtmp_download(self):
 296         """Indicate the download will use the RTMP protocol."""
 297         self.to_screen(u'RTMP download detected')
 298
 299     def _get_available_subtitles(self, video_id):
 300         self.report_video_subtitles_download(video_id)
 301         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 302         try:
 303             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 304         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 305             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 306         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 307         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 308         if not sub_lang_list:
 309             return (u'video doesn\'t have subtitles', None)
 310         return sub_lang_list
 311
 312     def _list_available_subtitles(self, video_id):
 313         sub_lang_list = self._get_available_subtitles(video_id)
 314         self.report_video_subtitles_available(video_id, sub_lang_list)
 315
 316     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 317         """
 318         Return tuple:
 319         (error_message, sub_lang, sub)
 320         """
 321         self.report_video_subtitles_request(video_id, sub_lang, format)
 322         params = compat_urllib_parse.urlencode({
 323             'lang': sub_lang,
 324             'name': sub_name,
 325             'v': video_id,
 326             'fmt': format,
 327         })
 328         url = 'http://www.youtube.com/api/timedtext?' + params
 329         try:
 330             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 331         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 332             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 333         if not sub:
 334             return (u'Did not fetch video subtitles', None, None)
 335         return (None, sub_lang, sub)
 336
 337     def _extract_subtitle(self, video_id):
 338         """
 339         Return a list with a tuple:
 340         [(error_message, sub_lang, sub)]
 341         """
 342         sub_lang_list = self._get_available_subtitles(video_id)
 343         sub_format = self._downloader.params.get('subtitlesformat')
 344         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 345             return [(sub_lang_list[0], None, None)]
 346         if self._downloader.params.get('subtitleslang', False):
 347             sub_lang = self._downloader.params.get('subtitleslang')
 348         elif 'en' in sub_lang_list:
 349             sub_lang = 'en'
 350         else:
 351             sub_lang = list(sub_lang_list.keys())[0]
 352         if not sub_lang in sub_lang_list:
 353             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 354
 355         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 356         return [subtitle]
 357
 358     def _extract_all_subtitles(self, video_id):
 359         sub_lang_list = self._get_available_subtitles(video_id)
 360         sub_format = self._downloader.params.get('subtitlesformat')
 361         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 362             return [(sub_lang_list[0], None, None)]
 363         subtitles = []
 364         for sub_lang in sub_lang_list:
 365             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 366             subtitles.append(subtitle)
 367         return subtitles
 368
 369     def _print_formats(self, formats):
 370         print('Available formats:')
 371         for x in formats:
 372             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 373
 374     def _real_initialize(self):
 375         if self._downloader is None:
 376             return
 377
 378         username = None
 379         password = None
 380         downloader_params = self._downloader.params
 381
 382         # Attempt to use provided username and password or .netrc data
 383         if downloader_params.get('username', None) is not None:
 384             username = downloader_params['username']
 385             password = downloader_params['password']
 386         elif downloader_params.get('usenetrc', False):
 387             try:
 388                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 389                 if info is not None:
 390                     username = info[0]
 391                     password = info[2]
 392                 else:
 393                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 394             except (IOError, netrc.NetrcParseError) as err:
 395                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 396                 return
 397
 398         # Set language
 399         request = compat_urllib_request.Request(self._LANG_URL)
 400         try:
 401             self.report_lang()
 402             compat_urllib_request.urlopen(request).read()
 403         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 404             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 405             return
 406
 407         # No authentication to be performed
 408         if username is None:
 409             return
 410
 411         request = compat_urllib_request.Request(self._LOGIN_URL)
 412         try:
 413             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 414         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 415             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 416             return
 417
 418         galx = None
 419         dsh = None
 420         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 421         if match:
 422           galx = match.group(1)
 423
 424         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 425         if match:
 426           dsh = match.group(1)
 427
 428         # Log in
 429         login_form_strs = {
 430                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 431                 u'Email': username,
 432                 u'GALX': galx,
 433                 u'Passwd': password,
 434                 u'PersistentCookie': u'yes',
 435                 u'_utf8': u'霱',
 436                 u'bgresponse': u'js_disabled',
 437                 u'checkConnection': u'',
 438                 u'checkedDomains': u'youtube',
 439                 u'dnConn': u'',
 440                 u'dsh': dsh,
 441                 u'pstMsg': u'0',
 442                 u'rmShown': u'1',
 443                 u'secTok': u'',
 444                 u'signIn': u'Sign in',
 445                 u'timeStmp': u'',
 446                 u'service': u'youtube',
 447                 u'uilel': u'3',
 448                 u'hl': u'en_US',
 449         }
 450         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 451         # chokes on unicode
 452         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 453         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 454         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 455         try:
 456             self.report_login()
 457             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 458             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 459                 self._downloader.report_warning(u'unable to log in: bad username or password')
 460                 return
 461         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 462             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 463             return
 464
 465         # Confirm age
 466         age_form = {
 467                 'next_url':     '/',
 468                 'action_confirm':   'Confirm',
 469                 }
 470         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 471         try:
 472             self.report_age_confirmation()
 473             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 474         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 475             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 476
 477     def _extract_id(self, url):
 478         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 479         if mobj is None:
 480             raise ExtractorError(u'Invalid URL: %s' % url)
 481         video_id = mobj.group(2)
 482         return video_id
 483
 484     def _real_extract(self, url):
 485         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 486         mobj = re.search(self._NEXT_URL_RE, url)
 487         if mobj:
 488             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 489         video_id = self._extract_id(url)
 490
 491         # Get video webpage
 492         self.report_video_webpage_download(video_id)
 493         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 494         request = compat_urllib_request.Request(url)
 495         try:
 496             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 497         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 498             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
 499
 500         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 501
 502         # Attempt to extract SWF player URL
 503         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 504         if mobj is not None:
 505             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 506         else:
 507             player_url = None
 508
 509         # Get video info
 510         self.report_video_info_webpage_download(video_id)
 511         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 512             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 513                     % (video_id, el_type))
 514             video_info_webpage = self._download_webpage(video_info_url, video_id,
 515                                     note=False,
 516                                     errnote='unable to download video info webpage')
 517             video_info = compat_parse_qs(video_info_webpage)
 518             if 'token' in video_info:
 519                 break
 520         if 'token' not in video_info:
 521             if 'reason' in video_info:
 522                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
 523             else:
 524                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
 525
 526         # Check for "rental" videos
 527         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 528             raise ExtractorError(u'"rental" videos not supported')
 529
 530         # Start extracting information
 531         self.report_information_extraction(video_id)
 532
 533         # uploader
 534         if 'author' not in video_info:
 535             raise ExtractorError(u'Unable to extract uploader name')
 536         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 537
 538         # uploader_id
 539         video_uploader_id = None
 540         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 541         if mobj is not None:
 542             video_uploader_id = mobj.group(1)
 543         else:
 544             self._downloader.report_warning(u'unable to extract uploader nickname')
 545
 546         # title
 547         if 'title' not in video_info:
 548             raise ExtractorError(u'Unable to extract video title')
 549         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 550
 551         # thumbnail image
 552         if 'thumbnail_url' not in video_info:
 553             self._downloader.report_warning(u'unable to extract video thumbnail')
 554             video_thumbnail = ''
 555         else:   # don't panic if we can't find it
 556             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 557
 558         # upload date
 559         upload_date = None
 560         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 561         if mobj is not None:
 562             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 563             upload_date = unified_strdate(upload_date)
 564
 565         # description
 566         video_description = get_element_by_id("eow-description", video_webpage)
 567         if video_description:
 568             video_description = clean_html(video_description)
 569         else:
 570             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 571             if fd_mobj:
 572                 video_description = unescapeHTML(fd_mobj.group(1))
 573             else:
 574                 video_description = u''
 575
 576         # subtitles
 577         video_subtitles = None
 578
 579         if self._downloader.params.get('writesubtitles', False):
 580             video_subtitles = self._extract_subtitle(video_id)
 581             if video_subtitles:
 582                 (sub_error, sub_lang, sub) = video_subtitles[0]
 583                 if sub_error:
 584                     self._downloader.report_error(sub_error)
 585
 586         if self._downloader.params.get('allsubtitles', False):
 587             video_subtitles = self._extract_all_subtitles(video_id)
 588             for video_subtitle in video_subtitles:
 589                 (sub_error, sub_lang, sub) = video_subtitle
 590                 if sub_error:
 591                     self._downloader.report_error(sub_error)
 592
 593         if self._downloader.params.get('listsubtitles', False):
 594             sub_lang_list = self._list_available_subtitles(video_id)
 595             return
 596
 597         if 'length_seconds' not in video_info:
 598             self._downloader.report_warning(u'unable to extract video duration')
 599             video_duration = ''
 600         else:
 601             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 602
 603         # token
 604         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 605
 606         # Decide which formats to download
 607         req_format = self._downloader.params.get('format', None)
 608
 609         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 610             self.report_rtmp_download()
 611             video_url_list = [(None, video_info['conn'][0])]
 612         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 613             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 614             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 615             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 616             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 617
 618             format_limit = self._downloader.params.get('format_limit', None)
 619             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 620             if format_limit is not None and format_limit in available_formats:
 621                 format_list = available_formats[available_formats.index(format_limit):]
 622             else:
 623                 format_list = available_formats
 624             existing_formats = [x for x in format_list if x in url_map]
 625             if len(existing_formats) == 0:
 626                 raise ExtractorError(u'no known formats available for video')
 627             if self._downloader.params.get('listformats', None):
 628                 self._print_formats(existing_formats)
 629                 return
 630             if req_format is None or req_format == 'best':
 631                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 632             elif req_format == 'worst':
 633                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 634             elif req_format in ('-1', 'all'):
 635                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 636             else:
 637                 # Specific formats. We pick the first in a slash-delimeted sequence.
 638                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 639                 req_formats = req_format.split('/')
 640                 video_url_list = None
 641                 for rf in req_formats:
 642                     if rf in url_map:
 643                         video_url_list = [(rf, url_map[rf])]
 644                         break
 645                 if video_url_list is None:
 646                     raise ExtractorError(u'requested format not available')
 647         else:
 648             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
 649
 650         results = []
 651         for format_param, video_real_url in video_url_list:
 652             # Extension
 653             video_extension = self._video_extensions.get(format_param, 'flv')
 654
 655             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 656                                               self._video_dimensions.get(format_param, '???'))
 657
 658             results.append({
 659                 'id':       video_id,
 660                 'url':      video_real_url,
 661                 'uploader': video_uploader,
 662                 'uploader_id': video_uploader_id,
 663                 'upload_date':  upload_date,
 664                 'title':    video_title,
 665                 'ext':      video_extension,
 666                 'format':   video_format,
 667                 'thumbnail':    video_thumbnail,
 668                 'description':  video_description,
 669                 'player_url':   player_url,
 670                 'subtitles':    video_subtitles,
 671                 'duration':     video_duration
 672             })
 673         return results
 674
 675
 676 class MetacafeIE(InfoExtractor):
 677     """Information Extractor for metacafe.com."""
 678
 679     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 680     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 681     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 682     IE_NAME = u'metacafe'
 683
 684     def report_disclaimer(self):
 685         """Report disclaimer retrieval."""
 686         self.to_screen(u'Retrieving disclaimer')
 687
 688     def _real_initialize(self):
 689         # Retrieve disclaimer
 690         request = compat_urllib_request.Request(self._DISCLAIMER)
 691         try:
 692             self.report_disclaimer()
 693             disclaimer = compat_urllib_request.urlopen(request).read()
 694         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 695             raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
 696
 697         # Confirm age
 698         disclaimer_form = {
 699             'filters': '0',
 700             'submit': "Continue - I'm over 18",
 701             }
 702         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 703         try:
 704             self.report_age_confirmation()
 705             disclaimer = compat_urllib_request.urlopen(request).read()
 706         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 707             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 708
 709     def _real_extract(self, url):
 710         # Extract id and simplified title from URL
 711         mobj = re.match(self._VALID_URL, url)
 712         if mobj is None:
 713             raise ExtractorError(u'Invalid URL: %s' % url)
 714
 715         video_id = mobj.group(1)
 716
 717         # Check if video comes from YouTube
 718         mobj2 = re.match(r'^yt-(.*)$', video_id)
 719         if mobj2 is not None:
 720             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
 721
 722         # Retrieve video webpage to extract further information
 723         webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
 724
 725         # Extract URL, uploader and title from webpage
 726         self.report_extraction(video_id)
 727         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 728         if mobj is not None:
 729             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 730             video_extension = mediaURL[-3:]
 731
 732             # Extract gdaKey if available
 733             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 734             if mobj is None:
 735                 video_url = mediaURL
 736             else:
 737                 gdaKey = mobj.group(1)
 738                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 739         else:
 740             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 741             if mobj is None:
 742                 raise ExtractorError(u'Unable to extract media URL')
 743             vardict = compat_parse_qs(mobj.group(1))
 744             if 'mediaData' not in vardict:
 745                 raise ExtractorError(u'Unable to extract media URL')
 746             mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
 747             if mobj is None:
 748                 raise ExtractorError(u'Unable to extract media URL')
 749             mediaURL = mobj.group('mediaURL').replace('\\/', '/')
 750             video_extension = mediaURL[-3:]
 751             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
 752
 753         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 754         if mobj is None:
 755             raise ExtractorError(u'Unable to extract title')
 756         video_title = mobj.group(1).decode('utf-8')
 757
 758         mobj = re.search(r'submitter=(.*?);', webpage)
 759         if mobj is None:
 760             raise ExtractorError(u'Unable to extract uploader nickname')
 761         video_uploader = mobj.group(1)
 762
 763         return [{
 764             'id':       video_id.decode('utf-8'),
 765             'url':      video_url.decode('utf-8'),
 766             'uploader': video_uploader.decode('utf-8'),
 767             'upload_date':  None,
 768             'title':    video_title,
 769             'ext':      video_extension.decode('utf-8'),
 770         }]
 771
 772 class DailymotionIE(InfoExtractor):
 773     """Information Extractor for Dailymotion"""
 774
 775     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 776     IE_NAME = u'dailymotion'
 777
 778     def _real_extract(self, url):
 779         # Extract id and simplified title from URL
 780         mobj = re.match(self._VALID_URL, url)
 781         if mobj is None:
 782             raise ExtractorError(u'Invalid URL: %s' % url)
 783
 784         video_id = mobj.group(1).split('_')[0].split('?')[0]
 785
 786         video_extension = 'mp4'
 787
 788         # Retrieve video webpage to extract further information
 789         request = compat_urllib_request.Request(url)
 790         request.add_header('Cookie', 'family_filter=off')
 791         webpage = self._download_webpage(request, video_id)
 792
 793         # Extract URL, uploader and title from webpage
 794         self.report_extraction(video_id)
 795         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 796         if mobj is None:
 797             raise ExtractorError(u'Unable to extract media URL')
 798         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 799
 800         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 801             if key in flashvars:
 802                 max_quality = key
 803                 self.to_screen(u'Using %s' % key)
 804                 break
 805         else:
 806             raise ExtractorError(u'Unable to extract video URL')
 807
 808         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 809         if mobj is None:
 810             raise ExtractorError(u'Unable to extract video URL')
 811
 812         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 813
 814         # TODO: support choosing qualities
 815
 816         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 817         if mobj is None:
 818             raise ExtractorError(u'Unable to extract title')
 819         video_title = unescapeHTML(mobj.group('title'))
 820
 821         video_uploader = None
 822         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 823         if mobj is None:
 824             # lookin for official user
 825             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 826             if mobj_official is None:
 827                 self._downloader.report_warning(u'unable to extract uploader nickname')
 828             else:
 829                 video_uploader = mobj_official.group(1)
 830         else:
 831             video_uploader = mobj.group(1)
 832
 833         video_upload_date = None
 834         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 835         if mobj is not None:
 836             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 837
 838         return [{
 839             'id':       video_id,
 840             'url':      video_url,
 841             'uploader': video_uploader,
 842             'upload_date':  video_upload_date,
 843             'title':    video_title,
 844             'ext':      video_extension,
 845         }]
 846
 847
 848 class PhotobucketIE(InfoExtractor):
 849     """Information extractor for photobucket.com."""
 850
 851     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 852     IE_NAME = u'photobucket'
 853
 854     def _real_extract(self, url):
 855         # Extract id from URL
 856         mobj = re.match(self._VALID_URL, url)
 857         if mobj is None:
 858             raise ExtractorError(u'Invalid URL: %s' % url)
 859
 860         video_id = mobj.group(1)
 861
 862         video_extension = 'flv'
 863
 864         # Retrieve video webpage to extract further information
 865         request = compat_urllib_request.Request(url)
 866         try:
 867             self.report_download_webpage(video_id)
 868             webpage = compat_urllib_request.urlopen(request).read()
 869         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 870             raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
 871
 872         # Extract URL, uploader, and title from webpage
 873         self.report_extraction(video_id)
 874         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 875         if mobj is None:
 876             raise ExtractorError(u'Unable to extract media URL')
 877         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 878
 879         video_url = mediaURL
 880
 881         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 882         if mobj is None:
 883             raise ExtractorError(u'Unable to extract title')
 884         video_title = mobj.group(1).decode('utf-8')
 885
 886         video_uploader = mobj.group(2).decode('utf-8')
 887
 888         return [{
 889             'id':       video_id.decode('utf-8'),
 890             'url':      video_url.decode('utf-8'),
 891             'uploader': video_uploader,
 892             'upload_date':  None,
 893             'title':    video_title,
 894             'ext':      video_extension.decode('utf-8'),
 895         }]
 896
 897
 898 class YahooIE(InfoExtractor):
 899     """Information extractor for video.yahoo.com."""
 900
 901     _WORKING = False
 902     # _VALID_URL matches all Yahoo! Video URLs
 903     # _VPAGE_URL matches only the extractable '/watch/' URLs
 904     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 905     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 906     IE_NAME = u'video.yahoo'
 907
 908     def _real_extract(self, url, new_video=True):
 909         # Extract ID from URL
 910         mobj = re.match(self._VALID_URL, url)
 911         if mobj is None:
 912             raise ExtractorError(u'Invalid URL: %s' % url)
 913
 914         video_id = mobj.group(2)
 915         video_extension = 'flv'
 916
 917         # Rewrite valid but non-extractable URLs as
 918         # extractable English language /watch/ URLs
 919         if re.match(self._VPAGE_URL, url) is None:
 920             request = compat_urllib_request.Request(url)
 921             try:
 922                 webpage = compat_urllib_request.urlopen(request).read()
 923             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 924                 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
 925
 926             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 927             if mobj is None:
 928                 raise ExtractorError(u'Unable to extract id field')
 929             yahoo_id = mobj.group(1)
 930
 931             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 932             if mobj is None:
 933                 raise ExtractorError(u'Unable to extract vid field')
 934             yahoo_vid = mobj.group(1)
 935
 936             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 937             return self._real_extract(url, new_video=False)
 938
 939         # Retrieve video webpage to extract further information
 940         request = compat_urllib_request.Request(url)
 941         try:
 942             self.report_download_webpage(video_id)
 943             webpage = compat_urllib_request.urlopen(request).read()
 944         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 945             raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
 946
 947         # Extract uploader and title from webpage
 948         self.report_extraction(video_id)
 949         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 950         if mobj is None:
 951             raise ExtractorError(u'Unable to extract video title')
 952         video_title = mobj.group(1).decode('utf-8')
 953
 954         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 955         if mobj is None:
 956             raise ExtractorError(u'Unable to extract video uploader')
 957         video_uploader = mobj.group(1).decode('utf-8')
 958
 959         # Extract video thumbnail
 960         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 961         if mobj is None:
 962             raise ExtractorError(u'Unable to extract video thumbnail')
 963         video_thumbnail = mobj.group(1).decode('utf-8')
 964
 965         # Extract video description
 966         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 967         if mobj is None:
 968             raise ExtractorError(u'Unable to extract video description')
 969         video_description = mobj.group(1).decode('utf-8')
 970         if not video_description:
 971             video_description = 'No description available.'
 972
 973         # Extract video height and width
 974         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 975         if mobj is None:
 976             raise ExtractorError(u'Unable to extract video height')
 977         yv_video_height = mobj.group(1)
 978
 979         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 980         if mobj is None:
 981             raise ExtractorError(u'Unable to extract video width')
 982         yv_video_width = mobj.group(1)
 983
 984         # Retrieve video playlist to extract media URL
 985         # I'm not completely sure what all these options are, but we
 986         # seem to need most of them, otherwise the server sends a 401.
 987         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 988         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 989         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 990                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 991                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 992         try:
 993             self.report_download_webpage(video_id)
 994             webpage = compat_urllib_request.urlopen(request).read()
 995         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 996             raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
 997
 998         # Extract media URL from playlist XML
 999         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1000         if mobj is None:
1001             raise ExtractorError(u'Unable to extract media URL')
1002         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1003         video_url = unescapeHTML(video_url)
1004
1005         return [{
1006             'id':       video_id.decode('utf-8'),
1007             'url':      video_url,
1008             'uploader': video_uploader,
1009             'upload_date':  None,
1010             'title':    video_title,
1011             'ext':      video_extension.decode('utf-8'),
1012             'thumbnail':    video_thumbnail.decode('utf-8'),
1013             'description':  video_description,
1014         }]
1015
1016
1017 class VimeoIE(InfoExtractor):
1018     """Information extractor for vimeo.com."""
1019
1020     # _VALID_URL matches Vimeo URLs
1021     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1022     IE_NAME = u'vimeo'
1023
1024     def _real_extract(self, url, new_video=True):
1025         # Extract ID from URL
1026         mobj = re.match(self._VALID_URL, url)
1027         if mobj is None:
1028             raise ExtractorError(u'Invalid URL: %s' % url)
1029
1030         video_id = mobj.group('id')
1031         if not mobj.group('proto'):
1032             url = 'https://' + url
1033         if mobj.group('direct_link'):
1034             url = 'https://vimeo.com/' + video_id
1035
1036         # Retrieve video webpage to extract further information
1037         request = compat_urllib_request.Request(url, None, std_headers)
1038         webpage = self._download_webpage(request, video_id)
1039
1040         # Now we begin extracting as much information as we can from what we
1041         # retrieved. First we extract the information common to all extractors,
1042         # and latter we extract those that are Vimeo specific.
1043         self.report_extraction(video_id)
1044
1045         # Extract the config JSON
1046         try:
1047             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1048             config = json.loads(config)
1049         except:
1050             if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1051                 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1052             else:
1053                 raise ExtractorError(u'Unable to extract info section')
1054
1055         # Extract title
1056         video_title = config["video"]["title"]
1057
1058         # Extract uploader and uploader_id
1059         video_uploader = config["video"]["owner"]["name"]
1060         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1061
1062         # Extract video thumbnail
1063         video_thumbnail = config["video"]["thumbnail"]
1064
1065         # Extract video description
1066         video_description = get_element_by_attribute("itemprop", "description", webpage)
1067         if video_description: video_description = clean_html(video_description)
1068         else: video_description = u''
1069
1070         # Extract upload date
1071         video_upload_date = None
1072         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1073         if mobj is not None:
1074             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1075
1076         # Vimeo specific: extract request signature and timestamp
1077         sig = config['request']['signature']
1078         timestamp = config['request']['timestamp']
1079
1080         # Vimeo specific: extract video codec and quality information
1081         # First consider quality, then codecs, then take everything
1082         # TODO bind to format param
1083         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1084         files = { 'hd': [], 'sd': [], 'other': []}
1085         for codec_name, codec_extension in codecs:
1086             if codec_name in config["video"]["files"]:
1087                 if 'hd' in config["video"]["files"][codec_name]:
1088                     files['hd'].append((codec_name, codec_extension, 'hd'))
1089                 elif 'sd' in config["video"]["files"][codec_name]:
1090                     files['sd'].append((codec_name, codec_extension, 'sd'))
1091                 else:
1092                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1093
1094         for quality in ('hd', 'sd', 'other'):
1095             if len(files[quality]) > 0:
1096                 video_quality = files[quality][0][2]
1097                 video_codec = files[quality][0][0]
1098                 video_extension = files[quality][0][1]
1099                 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1100                 break
1101         else:
1102             raise ExtractorError(u'No known codec found')
1103
1104         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1105                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1106
1107         return [{
1108             'id':       video_id,
1109             'url':      video_url,
1110             'uploader': video_uploader,
1111             'uploader_id': video_uploader_id,
1112             'upload_date':  video_upload_date,
1113             'title':    video_title,
1114             'ext':      video_extension,
1115             'thumbnail':    video_thumbnail,
1116             'description':  video_description,
1117         }]
1118
1119
1120 class ArteTvIE(InfoExtractor):
1121     """arte.tv information extractor."""
1122
1123     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1124     _LIVE_URL = r'index-[0-9]+\.html$'
1125
1126     IE_NAME = u'arte.tv'
1127
1128     def fetch_webpage(self, url):
1129         request = compat_urllib_request.Request(url)
1130         try:
1131             self.report_download_webpage(url)
1132             webpage = compat_urllib_request.urlopen(request).read()
1133         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1134             raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1135         except ValueError as err:
1136             raise ExtractorError(u'Invalid URL: %s' % url)
1137         return webpage
1138
1139     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1140         page = self.fetch_webpage(url)
1141         mobj = re.search(regex, page, regexFlags)
1142         info = {}
1143
1144         if mobj is None:
1145             raise ExtractorError(u'Invalid URL: %s' % url)
1146
1147         for (i, key, err) in matchTuples:
1148             if mobj.group(i) is None:
1149                 raise ExtractorError(err)
1150             else:
1151                 info[key] = mobj.group(i)
1152
1153         return info
1154
1155     def extractLiveStream(self, url):
1156         video_lang = url.split('/')[-4]
1157         info = self.grep_webpage(
1158             url,
1159             r'src="(.*?/videothek_js.*?\.js)',
1160             0,
1161             [
1162                 (1, 'url', u'Invalid URL: %s' % url)
1163             ]
1164         )
1165         http_host = url.split('/')[2]
1166         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1167         info = self.grep_webpage(
1168             next_url,
1169             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1170                 '(http://.*?\.swf).*?' +
1171                 '(rtmp://.*?)\'',
1172             re.DOTALL,
1173             [
1174                 (1, 'path',   u'could not extract video path: %s' % url),
1175                 (2, 'player', u'could not extract video player: %s' % url),
1176                 (3, 'url',    u'could not extract video url: %s' % url)
1177             ]
1178         )
1179         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1180
1181     def extractPlus7Stream(self, url):
1182         video_lang = url.split('/')[-3]
1183         info = self.grep_webpage(
1184             url,
1185             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1186             0,
1187             [
1188                 (1, 'url', u'Invalid URL: %s' % url)
1189             ]
1190         )
1191         next_url = compat_urllib_parse.unquote(info.get('url'))
1192         info = self.grep_webpage(
1193             next_url,
1194             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1195             0,
1196             [
1197                 (1, 'url', u'Could not find <video> tag: %s' % url)
1198             ]
1199         )
1200         next_url = compat_urllib_parse.unquote(info.get('url'))
1201
1202         info = self.grep_webpage(
1203             next_url,
1204             r'<video id="(.*?)".*?>.*?' +
1205                 '<name>(.*?)</name>.*?' +
1206                 '<dateVideo>(.*?)</dateVideo>.*?' +
1207                 '<url quality="hd">(.*?)</url>',
1208             re.DOTALL,
1209             [
1210                 (1, 'id',    u'could not extract video id: %s' % url),
1211                 (2, 'title', u'could not extract video title: %s' % url),
1212                 (3, 'date',  u'could not extract video date: %s' % url),
1213                 (4, 'url',   u'could not extract video url: %s' % url)
1214             ]
1215         )
1216
1217         return {
1218             'id':           info.get('id'),
1219             'url':          compat_urllib_parse.unquote(info.get('url')),
1220             'uploader':     u'arte.tv',
1221             'upload_date':  unified_strdate(info.get('date')),
1222             'title':        info.get('title').decode('utf-8'),
1223             'ext':          u'mp4',
1224             'format':       u'NA',
1225             'player_url':   None,
1226         }
1227
1228     def _real_extract(self, url):
1229         video_id = url.split('/')[-1]
1230         self.report_extraction(video_id)
1231
1232         if re.search(self._LIVE_URL, video_id) is not None:
1233             self.extractLiveStream(url)
1234             return
1235         else:
1236             info = self.extractPlus7Stream(url)
1237
1238         return [info]
1239
1240
1241 class GenericIE(InfoExtractor):
1242     """Generic last-resort information extractor."""
1243
1244     _VALID_URL = r'.*'
1245     IE_NAME = u'generic'
1246
1247     def report_download_webpage(self, video_id):
1248         """Report webpage download."""
1249         if not self._downloader.params.get('test', False):
1250             self._downloader.report_warning(u'Falling back on generic information extractor.')
1251         super(GenericIE, self).report_download_webpage(video_id)
1252
1253     def report_following_redirect(self, new_url):
1254         """Report information extraction."""
1255         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1256
1257     def _test_redirect(self, url):
1258         """Check if it is a redirect, like url shorteners, in case return the new url."""
1259         class HeadRequest(compat_urllib_request.Request):
1260             def get_method(self):
1261                 return "HEAD"
1262
1263         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1264             """
1265             Subclass the HTTPRedirectHandler to make it use our
1266             HeadRequest also on the redirected URL
1267             """
1268             def redirect_request(self, req, fp, code, msg, headers, newurl):
1269                 if code in (301, 302, 303, 307):
1270                     newurl = newurl.replace(' ', '%20')
1271                     newheaders = dict((k,v) for k,v in req.headers.items()
1272                                       if k.lower() not in ("content-length", "content-type"))
1273                     return HeadRequest(newurl,
1274                                        headers=newheaders,
1275                                        origin_req_host=req.get_origin_req_host(),
1276                                        unverifiable=True)
1277                 else:
1278                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1279
1280         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1281             """
1282             Fallback to GET if HEAD is not allowed (405 HTTP error)
1283             """
1284             def http_error_405(self, req, fp, code, msg, headers):
1285                 fp.read()
1286                 fp.close()
1287
1288                 newheaders = dict((k,v) for k,v in req.headers.items()
1289                                   if k.lower() not in ("content-length", "content-type"))
1290                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1291                                                  headers=newheaders,
1292                                                  origin_req_host=req.get_origin_req_host(),
1293                                                  unverifiable=True))
1294
1295         # Build our opener
1296         opener = compat_urllib_request.OpenerDirector()
1297         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1298                         HTTPMethodFallback, HEADRedirectHandler,
1299                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1300             opener.add_handler(handler())
1301
1302         response = opener.open(HeadRequest(url))
1303         new_url = response.geturl()
1304
1305         if url == new_url:
1306             return False
1307
1308         self.report_following_redirect(new_url)
1309         return new_url
1310
1311     def _real_extract(self, url):
1312         new_url = self._test_redirect(url)
1313         if new_url: return [self.url_result(new_url)]
1314
1315         video_id = url.split('/')[-1]
1316         try:
1317             webpage = self._download_webpage(url, video_id)
1318         except ValueError as err:
1319             # since this is the last-resort InfoExtractor, if
1320             # this error is thrown, it'll be thrown here
1321             raise ExtractorError(u'Invalid URL: %s' % url)
1322
1323         self.report_extraction(video_id)
1324         # Start with something easy: JW Player in SWFObject
1325         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1326         if mobj is None:
1327             # Broaden the search a little bit
1328             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1329         if mobj is None:
1330             # Broaden the search a little bit: JWPlayer JS loader
1331             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1332         if mobj is None:
1333             raise ExtractorError(u'Invalid URL: %s' % url)
1334
1335         # It's possible that one of the regexes
1336         # matched, but returned an empty group:
1337         if mobj.group(1) is None:
1338             raise ExtractorError(u'Invalid URL: %s' % url)
1339
1340         video_url = compat_urllib_parse.unquote(mobj.group(1))
1341         video_id = os.path.basename(video_url)
1342
1343         # here's a fun little line of code for you:
1344         video_extension = os.path.splitext(video_id)[1][1:]
1345         video_id = os.path.splitext(video_id)[0]
1346
1347         # it's tempting to parse this further, but you would
1348         # have to take into account all the variations like
1349         #   Video Title - Site Name
1350         #   Site Name | Video Title
1351         #   Video Title - Tagline | Site Name
1352         # and so on and so forth; it's just not practical
1353         mobj = re.search(r'<title>(.*)</title>', webpage)
1354         if mobj is None:
1355             raise ExtractorError(u'Unable to extract title')
1356         video_title = mobj.group(1)
1357
1358         # video uploader is domain name
1359         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1360         if mobj is None:
1361             raise ExtractorError(u'Unable to extract title')
1362         video_uploader = mobj.group(1)
1363
1364         return [{
1365             'id':       video_id,
1366             'url':      video_url,
1367             'uploader': video_uploader,
1368             'upload_date':  None,
1369             'title':    video_title,
1370             'ext':      video_extension,
1371         }]
1372
1373
1374 class YoutubeSearchIE(InfoExtractor):
1375     """Information Extractor for YouTube search queries."""
1376     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1377     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1378     _max_youtube_results = 1000
1379     IE_NAME = u'youtube:search'
1380
1381     def report_download_page(self, query, pagenum):
1382         """Report attempt to download search page with given number."""
1383         query = query.decode(preferredencoding())
1384         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1385
1386     def _real_extract(self, query):
1387         mobj = re.match(self._VALID_URL, query)
1388         if mobj is None:
1389             raise ExtractorError(u'Invalid search query "%s"' % query)
1390
1391         prefix, query = query.split(':')
1392         prefix = prefix[8:]
1393         query = query.encode('utf-8')
1394         if prefix == '':
1395             return self._get_n_results(query, 1)
1396         elif prefix == 'all':
1397             self._get_n_results(query, self._max_youtube_results)
1398         else:
1399             try:
1400                 n = int(prefix)
1401                 if n <= 0:
1402                     raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
1403                 elif n > self._max_youtube_results:
1404                     self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1405                     n = self._max_youtube_results
1406                 return self._get_n_results(query, n)
1407             except ValueError: # parsing prefix as integer fails
1408                 return self._get_n_results(query, 1)
1409
1410     def _get_n_results(self, query, n):
1411         """Get a specified number of results for a query"""
1412
1413         video_ids = []
1414         pagenum = 0
1415         limit = n
1416
1417         while (50 * pagenum) < limit:
1418             self.report_download_page(query, pagenum+1)
1419             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1420             request = compat_urllib_request.Request(result_url)
1421             try:
1422                 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1423             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1424                 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1425             api_response = json.loads(data)['data']
1426
1427             if not 'items' in api_response:
1428                 raise ExtractorError(u'[youtube] No video results')
1429
1430             new_ids = list(video['id'] for video in api_response['items'])
1431             video_ids += new_ids
1432
1433             limit = min(n, api_response['totalItems'])
1434             pagenum += 1
1435
1436         if len(video_ids) > n:
1437             video_ids = video_ids[:n]
1438         videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1439         return videos
1440
1441
1442 class GoogleSearchIE(InfoExtractor):
1443     """Information Extractor for Google Video search queries."""
1444     _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1445     _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1446     _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1447     _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1448     _max_google_results = 1000
1449     IE_NAME = u'video.google:search'
1450
1451     def report_download_page(self, query, pagenum):
1452         """Report attempt to download playlist page with given number."""
1453         query = query.decode(preferredencoding())
1454         self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
1455
1456     def _real_extract(self, query):
1457         mobj = re.match(self._VALID_URL, query)
1458         if mobj is None:
1459             raise ExtractorError(u'Invalid search query "%s"' % query)
1460
1461         prefix, query = query.split(':')
1462         prefix = prefix[8:]
1463         query = query.encode('utf-8')
1464         if prefix == '':
1465             self._download_n_results(query, 1)
1466             return
1467         elif prefix == 'all':
1468             self._download_n_results(query, self._max_google_results)
1469             return
1470         else:
1471             try:
1472                 n = int(prefix)
1473                 if n <= 0:
1474                     raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
1475                 elif n > self._max_google_results:
1476                     self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1477                     n = self._max_google_results
1478                 self._download_n_results(query, n)
1479                 return
1480             except ValueError: # parsing prefix as integer fails
1481                 self._download_n_results(query, 1)
1482                 return
1483
1484     def _download_n_results(self, query, n):
1485         """Downloads a specified number of results for a query"""
1486
1487         video_ids = []
1488         pagenum = 0
1489
1490         while True:
1491             self.report_download_page(query, pagenum)
1492             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1493             request = compat_urllib_request.Request(result_url)
1494             try:
1495                 page = compat_urllib_request.urlopen(request).read()
1496             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1497                 raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err))
1498
1499             # Extract video identifiers
1500             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1501                 video_id = mobj.group(1)
1502                 if video_id not in video_ids:
1503                     video_ids.append(video_id)
1504                     if len(video_ids) == n:
1505                         # Specified n videos reached
1506                         for id in video_ids:
1507                             self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1508                         return
1509
1510             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1511                 for id in video_ids:
1512                     self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1513                 return
1514
1515             pagenum = pagenum + 1
1516
1517
1518 class YahooSearchIE(InfoExtractor):
1519     """Information Extractor for Yahoo! Video search queries."""
1520
1521     _WORKING = False
1522     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1523     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1524     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1525     _MORE_PAGES_INDICATOR = r'\s*Next'
1526     _max_yahoo_results = 1000
1527     IE_NAME = u'video.yahoo:search'
1528
1529     def report_download_page(self, query, pagenum):
1530         """Report attempt to download playlist page with given number."""
1531         query = query.decode(preferredencoding())
1532         self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
1533
1534     def _real_extract(self, query):
1535         mobj = re.match(self._VALID_URL, query)
1536         if mobj is None:
1537             raise ExtractorError(u'Invalid search query "%s"' % query)
1538
1539         prefix, query = query.split(':')
1540         prefix = prefix[8:]
1541         query = query.encode('utf-8')
1542         if prefix == '':
1543             self._download_n_results(query, 1)
1544             return
1545         elif prefix == 'all':
1546             self._download_n_results(query, self._max_yahoo_results)
1547             return
1548         else:
1549             try:
1550                 n = int(prefix)
1551                 if n <= 0:
1552                     raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
1553                 elif n > self._max_yahoo_results:
1554                     self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1555                     n = self._max_yahoo_results
1556                 self._download_n_results(query, n)
1557                 return
1558             except ValueError: # parsing prefix as integer fails
1559                 self._download_n_results(query, 1)
1560                 return
1561
1562     def _download_n_results(self, query, n):
1563         """Downloads a specified number of results for a query"""
1564
1565         video_ids = []
1566         already_seen = set()
1567         pagenum = 1
1568
1569         while True:
1570             self.report_download_page(query, pagenum)
1571             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1572             request = compat_urllib_request.Request(result_url)
1573             try:
1574                 page = compat_urllib_request.urlopen(request).read()
1575             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1576                 raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err))
1577
1578             # Extract video identifiers
1579             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1580                 video_id = mobj.group(1)
1581                 if video_id not in already_seen:
1582                     video_ids.append(video_id)
1583                     already_seen.add(video_id)
1584                     if len(video_ids) == n:
1585                         # Specified n videos reached
1586                         for id in video_ids:
1587                             self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1588                         return
1589
1590             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1591                 for id in video_ids:
1592                     self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1593                 return
1594
1595             pagenum = pagenum + 1
1596
1597
1598 class YoutubePlaylistIE(InfoExtractor):
1599     """Information Extractor for YouTube playlists."""
1600
1601     _VALID_URL = r"""(?:
1602                         (?:https?://)?
1603                         (?:\w+\.)?
1604                         youtube\.com/
1605                         (?:
1606                            (?:course|view_play_list|my_playlists|artist|playlist|watch)
1607                            \? (?:.*?&)*? (?:p|a|list)=
1608                         |  p/
1609                         )
1610                         ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1611                         .*
1612                      |
1613                         ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1614                      )"""
1615     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1616     _MAX_RESULTS = 50
1617     IE_NAME = u'youtube:playlist'
1618
1619     @classmethod
1620     def suitable(cls, url):
1621         """Receives a URL and returns True if suitable for this IE."""
1622         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1623
1624     def _real_extract(self, url):
1625         # Extract playlist id
1626         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1627         if mobj is None:
1628             raise ExtractorError(u'Invalid URL: %s' % url)
1629
1630         # Download playlist videos from API
1631         playlist_id = mobj.group(1) or mobj.group(2)
1632         page_num = 1
1633         videos = []
1634
1635         while True:
1636             url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1637             page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1638
1639             try:
1640                 response = json.loads(page)
1641             except ValueError as err:
1642                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1643
1644             if 'feed' not in response:
1645                 raise ExtractorError(u'Got a malformed response from YouTube API')
1646             playlist_title = response['feed']['title']['$t']
1647             if 'entry' not in response['feed']:
1648                 # Number of videos is a multiple of self._MAX_RESULTS
1649                 break
1650
1651             videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1652                         for entry in response['feed']['entry']
1653                         if 'content' in entry ]
1654
1655             if len(response['feed']['entry']) < self._MAX_RESULTS:
1656                 break
1657             page_num += 1
1658
1659         videos = [v[1] for v in sorted(videos)]
1660
1661         url_results = [self.url_result(url, 'Youtube') for url in videos]
1662         return [self.playlist_result(url_results, playlist_id, playlist_title)]
1663
1664
1665 class YoutubeChannelIE(InfoExtractor):
1666     """Information Extractor for YouTube channels."""
1667
1668     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1669     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1670     _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1671     _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1672     IE_NAME = u'youtube:channel'
1673
1674     def extract_videos_from_page(self, page):
1675         ids_in_page = []
1676         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1677             if mobj.group(1) not in ids_in_page:
1678                 ids_in_page.append(mobj.group(1))
1679         return ids_in_page
1680
1681     def _real_extract(self, url):
1682         # Extract channel id
1683         mobj = re.match(self._VALID_URL, url)
1684         if mobj is None:
1685             raise ExtractorError(u'Invalid URL: %s' % url)
1686
1687         # Download channel page
1688         channel_id = mobj.group(1)
1689         video_ids = []
1690         pagenum = 1
1691
1692         url = self._TEMPLATE_URL % (channel_id, pagenum)
1693         page = self._download_webpage(url, channel_id,
1694                                       u'Downloading page #%s' % pagenum)
1695
1696         # Extract video identifiers
1697         ids_in_page = self.extract_videos_from_page(page)
1698         video_ids.extend(ids_in_page)
1699
1700         # Download any subsequent channel pages using the json-based channel_ajax query
1701         if self._MORE_PAGES_INDICATOR in page:
1702             while True:
1703                 pagenum = pagenum + 1
1704
1705                 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1706                 page = self._download_webpage(url, channel_id,
1707                                               u'Downloading page #%s' % pagenum)
1708
1709                 page = json.loads(page)
1710
1711                 ids_in_page = self.extract_videos_from_page(page['content_html'])
1712                 video_ids.extend(ids_in_page)
1713
1714                 if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']:
1715                     break
1716
1717         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1718
1719         urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1720         url_entries = [self.url_result(url, 'Youtube') for url in urls]
1721         return [self.playlist_result(url_entries, channel_id)]
1722
1723
1724 class YoutubeUserIE(InfoExtractor):
1725     """Information Extractor for YouTube users."""
1726
1727     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1728     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1729     _GDATA_PAGE_SIZE = 50
1730     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1731     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1732     IE_NAME = u'youtube:user'
1733
1734     def _real_extract(self, url):
1735         # Extract username
1736         mobj = re.match(self._VALID_URL, url)
1737         if mobj is None:
1738             raise ExtractorError(u'Invalid URL: %s' % url)
1739
1740         username = mobj.group(1)
1741
1742         # Download video ids using YouTube Data API. Result size per
1743         # query is limited (currently to 50 videos) so we need to query
1744         # page by page until there are no video ids - it means we got
1745         # all of them.
1746
1747         video_ids = []
1748         pagenum = 0
1749
1750         while True:
1751             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1752
1753             gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1754             page = self._download_webpage(gdata_url, username,
1755                                           u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1756
1757             # Extract video identifiers
1758             ids_in_page = []
1759
1760             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1761                 if mobj.group(1) not in ids_in_page:
1762                     ids_in_page.append(mobj.group(1))
1763
1764             video_ids.extend(ids_in_page)
1765
1766             # A little optimization - if current page is not
1767             # "full", ie. does not contain PAGE_SIZE video ids then
1768             # we can assume that this page is the last one - there
1769             # are no more ids on further pages - no need to query
1770             # again.
1771
1772             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1773                 break
1774
1775             pagenum += 1
1776
1777         urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1778         url_results = [self.url_result(url, 'Youtube') for url in urls]
1779         return [self.playlist_result(url_results, playlist_title = username)]
1780
1781
1782 class BlipTVUserIE(InfoExtractor):
1783     """Information Extractor for blip.tv users."""
1784
1785     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1786     _PAGE_SIZE = 12
1787     IE_NAME = u'blip.tv:user'
1788
1789     def _real_extract(self, url):
1790         # Extract username
1791         mobj = re.match(self._VALID_URL, url)
1792         if mobj is None:
1793             raise ExtractorError(u'Invalid URL: %s' % url)
1794
1795         username = mobj.group(1)
1796
1797         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1798
1799         page = self._download_webpage(url, username, u'Downloading user page')
1800         mobj = re.search(r'data-users-id="([^"]+)"', page)
1801         page_base = page_base % mobj.group(1)
1802
1803
1804         # Download video ids using BlipTV Ajax calls. Result size per
1805         # query is limited (currently to 12 videos) so we need to query
1806         # page by page until there are no video ids - it means we got
1807         # all of them.
1808
1809         video_ids = []
1810         pagenum = 1
1811
1812         while True:
1813             url = page_base + "&page=" + str(pagenum)
1814             page = self._download_webpage(url, username,
1815                                           u'Downloading video ids from page %d' % pagenum)
1816
1817             # Extract video identifiers
1818             ids_in_page = []
1819
1820             for mobj in re.finditer(r'href="/([^"]+)"', page):
1821                 if mobj.group(1) not in ids_in_page:
1822                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1823
1824             video_ids.extend(ids_in_page)
1825
1826             # A little optimization - if current page is not
1827             # "full", ie. does not contain PAGE_SIZE video ids then
1828             # we can assume that this page is the last one - there
1829             # are no more ids on further pages - no need to query
1830             # again.
1831
1832             if len(ids_in_page) < self._PAGE_SIZE:
1833                 break
1834
1835             pagenum += 1
1836
1837         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1838         url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1839         return [self.playlist_result(url_entries, playlist_title = username)]
1840
1841
1842 class DepositFilesIE(InfoExtractor):
1843     """Information extractor for depositfiles.com"""
1844
1845     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1846
1847     def _real_extract(self, url):
1848         file_id = url.split('/')[-1]
1849         # Rebuild url in english locale
1850         url = 'http://depositfiles.com/en/files/' + file_id
1851
1852         # Retrieve file webpage with 'Free download' button pressed
1853         free_download_indication = { 'gateway_result' : '1' }
1854         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1855         try:
1856             self.report_download_webpage(file_id)
1857             webpage = compat_urllib_request.urlopen(request).read()
1858         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1859             raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1860
1861         # Search for the real file URL
1862         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1863         if (mobj is None) or (mobj.group(1) is None):
1864             # Try to figure out reason of the error.
1865             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1866             if (mobj is not None) and (mobj.group(1) is not None):
1867                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1868                 raise ExtractorError(u'%s' % restriction_message)
1869             else:
1870                 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1871
1872         file_url = mobj.group(1)
1873         file_extension = os.path.splitext(file_url)[1][1:]
1874
1875         # Search for file title
1876         mobj = re.search(r'<b title="(.*?)">', webpage)
1877         if mobj is None:
1878             raise ExtractorError(u'Unable to extract title')
1879         file_title = mobj.group(1).decode('utf-8')
1880
1881         return [{
1882             'id':       file_id.decode('utf-8'),
1883             'url':      file_url.decode('utf-8'),
1884             'uploader': None,
1885             'upload_date':  None,
1886             'title':    file_title,
1887             'ext':      file_extension.decode('utf-8'),
1888         }]
1889
1890
1891 class FacebookIE(InfoExtractor):
1892     """Information Extractor for Facebook"""
1893
1894     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1895     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1896     _NETRC_MACHINE = 'facebook'
1897     IE_NAME = u'facebook'
1898
1899     def report_login(self):
1900         """Report attempt to log in."""
1901         self.to_screen(u'Logging in')
1902
1903     def _real_initialize(self):
1904         if self._downloader is None:
1905             return
1906
1907         useremail = None
1908         password = None
1909         downloader_params = self._downloader.params
1910
1911         # Attempt to use provided username and password or .netrc data
1912         if downloader_params.get('username', None) is not None:
1913             useremail = downloader_params['username']
1914             password = downloader_params['password']
1915         elif downloader_params.get('usenetrc', False):
1916             try:
1917                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1918                 if info is not None:
1919                     useremail = info[0]
1920                     password = info[2]
1921                 else:
1922                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1923             except (IOError, netrc.NetrcParseError) as err:
1924                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1925                 return
1926
1927         if useremail is None:
1928             return
1929
1930         # Log in
1931         login_form = {
1932             'email': useremail,
1933             'pass': password,
1934             'login': 'Log+In'
1935             }
1936         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1937         try:
1938             self.report_login()
1939             login_results = compat_urllib_request.urlopen(request).read()
1940             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1941                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1942                 return
1943         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1944             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1945             return
1946
1947     def _real_extract(self, url):
1948         mobj = re.match(self._VALID_URL, url)
1949         if mobj is None:
1950             raise ExtractorError(u'Invalid URL: %s' % url)
1951         video_id = mobj.group('ID')
1952
1953         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1954         webpage = self._download_webpage(url, video_id)
1955
1956         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1957         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1958         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1959         if not m:
1960             raise ExtractorError(u'Cannot parse data')
1961         data = dict(json.loads(m.group(1)))
1962         params_raw = compat_urllib_parse.unquote(data['params'])
1963         params = json.loads(params_raw)
1964         video_data = params['video_data'][0]
1965         video_url = video_data.get('hd_src')
1966         if not video_url:
1967             video_url = video_data['sd_src']
1968         if not video_url:
1969             raise ExtractorError(u'Cannot find video URL')
1970         video_duration = int(video_data['video_duration'])
1971         thumbnail = video_data['thumbnail_src']
1972
1973         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
1974         if not m:
1975             raise ExtractorError(u'Cannot find title in webpage')
1976         video_title = unescapeHTML(m.group(1))
1977
1978         info = {
1979             'id': video_id,
1980             'title': video_title,
1981             'url': video_url,
1982             'ext': 'mp4',
1983             'duration': video_duration,
1984             'thumbnail': thumbnail,
1985         }
1986         return [info]
1987
1988
1989 class BlipTVIE(InfoExtractor):
1990     """Information extractor for blip.tv"""
1991
1992     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1993     _URL_EXT = r'^.*\.([a-z0-9]+)$'
1994     IE_NAME = u'blip.tv'
1995
1996     def report_direct_download(self, title):
1997         """Report information extraction."""
1998         self.to_screen(u'%s: Direct download detected' % title)
1999
2000     def _real_extract(self, url):
2001         mobj = re.match(self._VALID_URL, url)
2002         if mobj is None:
2003             raise ExtractorError(u'Invalid URL: %s' % url)
2004
2005         urlp = compat_urllib_parse_urlparse(url)
2006         if urlp.path.startswith('/play/'):
2007             request = compat_urllib_request.Request(url)
2008             response = compat_urllib_request.urlopen(request)
2009             redirecturl = response.geturl()
2010             rurlp = compat_urllib_parse_urlparse(redirecturl)
2011             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2012             url = 'http://blip.tv/a/a-' + file_id
2013             return self._real_extract(url)
2014
2015
2016         if '?' in url:
2017             cchar = '&'
2018         else:
2019             cchar = '?'
2020         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2021         request = compat_urllib_request.Request(json_url)
2022         request.add_header('User-Agent', 'iTunes/10.6.1')
2023         self.report_extraction(mobj.group(1))
2024         info = None
2025         try:
2026             urlh = compat_urllib_request.urlopen(request)
2027             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2028                 basename = url.split('/')[-1]
2029                 title,ext = os.path.splitext(basename)
2030                 title = title.decode('UTF-8')
2031                 ext = ext.replace('.', '')
2032                 self.report_direct_download(title)
2033                 info = {
2034                     'id': title,
2035                     'url': url,
2036                     'uploader': None,
2037                     'upload_date': None,
2038                     'title': title,
2039                     'ext': ext,
2040                     'urlhandle': urlh
2041                 }
2042         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2043             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2044         if info is None: # Regular URL
2045             try:
2046                 json_code_bytes = urlh.read()
2047                 json_code = json_code_bytes.decode('utf-8')
2048             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2049                 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2050
2051             try:
2052                 json_data = json.loads(json_code)
2053                 if 'Post' in json_data:
2054                     data = json_data['Post']
2055                 else:
2056                     data = json_data
2057
2058                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2059                 video_url = data['media']['url']
2060                 umobj = re.match(self._URL_EXT, video_url)
2061                 if umobj is None:
2062                     raise ValueError('Can not determine filename extension')
2063                 ext = umobj.group(1)
2064
2065                 info = {
2066                     'id': data['item_id'],
2067                     'url': video_url,
2068                     'uploader': data['display_name'],
2069                     'upload_date': upload_date,
2070                     'title': data['title'],
2071                     'ext': ext,
2072                     'format': data['media']['mimeType'],
2073                     'thumbnail': data['thumbnailUrl'],
2074                     'description': data['description'],
2075                     'player_url': data['embedUrl'],
2076                     'user_agent': 'iTunes/10.6.1',
2077                 }
2078             except (ValueError,KeyError) as err:
2079                 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
2080
2081         return [info]
2082
2083
2084 class MyVideoIE(InfoExtractor):
2085     """Information Extractor for myvideo.de."""
2086
2087     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2088     IE_NAME = u'myvideo'
2089
2090     def _real_extract(self,url):
2091         mobj = re.match(self._VALID_URL, url)
2092         if mobj is None:
2093             raise ExtractorError(u'Invalid URL: %s' % url)
2094
2095         video_id = mobj.group(1)
2096
2097         # Get video webpage
2098         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2099         webpage = self._download_webpage(webpage_url, video_id)
2100
2101         self.report_extraction(video_id)
2102         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2103                  webpage)
2104         if mobj is None:
2105             raise ExtractorError(u'Unable to extract media URL')
2106         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2107
2108         mobj = re.search('<title>([^<]+)</title>', webpage)
2109         if mobj is None:
2110             raise ExtractorError(u'Unable to extract title')
2111
2112         video_title = mobj.group(1)
2113
2114         return [{
2115             'id':       video_id,
2116             'url':      video_url,
2117             'uploader': None,
2118             'upload_date':  None,
2119             'title':    video_title,
2120             'ext':      u'flv',
2121         }]
2122
2123 class ComedyCentralIE(InfoExtractor):
2124     """Information extractor for The Daily Show and Colbert Report """
2125
2126     # urls can be abbreviations like :thedailyshow or :colbert
2127     # urls for episodes like:
2128     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2129     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2130     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2131     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2132                       |(https?://)?(www\.)?
2133                           (?P<showname>thedailyshow|colbertnation)\.com/
2134                          (full-episodes/(?P<episode>.*)|
2135                           (?P<clip>
2136                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2137                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2138                      $"""
2139
2140     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2141
2142     _video_extensions = {
2143         '3500': 'mp4',
2144         '2200': 'mp4',
2145         '1700': 'mp4',
2146         '1200': 'mp4',
2147         '750': 'mp4',
2148         '400': 'mp4',
2149     }
2150     _video_dimensions = {
2151         '3500': '1280x720',
2152         '2200': '960x540',
2153         '1700': '768x432',
2154         '1200': '640x360',
2155         '750': '512x288',
2156         '400': '384x216',
2157     }
2158
2159     @classmethod
2160     def suitable(cls, url):
2161         """Receives a URL and returns True if suitable for this IE."""
2162         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2163
2164     def _print_formats(self, formats):
2165         print('Available formats:')
2166         for x in formats:
2167             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2168
2169
2170     def _real_extract(self, url):
2171         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2172         if mobj is None:
2173             raise ExtractorError(u'Invalid URL: %s' % url)
2174
2175         if mobj.group('shortname'):
2176             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2177                 url = u'http://www.thedailyshow.com/full-episodes/'
2178             else:
2179                 url = u'http://www.colbertnation.com/full-episodes/'
2180             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2181             assert mobj is not None
2182
2183         if mobj.group('clip'):
2184             if mobj.group('showname') == 'thedailyshow':
2185                 epTitle = mobj.group('tdstitle')
2186             else:
2187                 epTitle = mobj.group('cntitle')
2188             dlNewest = False
2189         else:
2190             dlNewest = not mobj.group('episode')
2191             if dlNewest:
2192                 epTitle = mobj.group('showname')
2193             else:
2194                 epTitle = mobj.group('episode')
2195
2196         self.report_extraction(epTitle)
2197         webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2198         if dlNewest:
2199             url = htmlHandle.geturl()
2200             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2201             if mobj is None:
2202                 raise ExtractorError(u'Invalid redirected URL: ' + url)
2203             if mobj.group('episode') == '':
2204                 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2205             epTitle = mobj.group('episode')
2206
2207         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2208
2209         if len(mMovieParams) == 0:
2210             # The Colbert Report embeds the information in a without
2211             # a URL prefix; so extract the alternate reference
2212             # and then add the URL prefix manually.
2213
2214             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2215             if len(altMovieParams) == 0:
2216                 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2217             else:
2218                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2219
2220         uri = mMovieParams[0][1]
2221         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2222         indexXml = self._download_webpage(indexUrl, epTitle,
2223                                           u'Downloading show index',
2224                                           u'unable to download episode index')
2225
2226         results = []
2227
2228         idoc = xml.etree.ElementTree.fromstring(indexXml)
2229         itemEls = idoc.findall('.//item')
2230         for partNum,itemEl in enumerate(itemEls):
2231             mediaId = itemEl.findall('./guid')[0].text
2232             shortMediaId = mediaId.split(':')[-1]
2233             showId = mediaId.split(':')[-2].replace('.com', '')
2234             officialTitle = itemEl.findall('./title')[0].text
2235             officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2236
2237             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2238                         compat_urllib_parse.urlencode({'uri': mediaId}))
2239             configXml = self._download_webpage(configUrl, epTitle,
2240                                                u'Downloading configuration for %s' % shortMediaId)
2241
2242             cdoc = xml.etree.ElementTree.fromstring(configXml)
2243             turls = []
2244             for rendition in cdoc.findall('.//rendition'):
2245                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2246                 turls.append(finfo)
2247
2248             if len(turls) == 0:
2249                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2250                 continue
2251
2252             if self._downloader.params.get('listformats', None):
2253                 self._print_formats([i[0] for i in turls])
2254                 return
2255
2256             # For now, just pick the highest bitrate
2257             format,rtmp_video_url = turls[-1]
2258
2259             # Get the format arg from the arg stream
2260             req_format = self._downloader.params.get('format', None)
2261
2262             # Select format if we can find one
2263             for f,v in turls:
2264                 if f == req_format:
2265                     format, rtmp_video_url = f, v
2266                     break
2267
2268             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2269             if not m:
2270                 raise ExtractorError(u'Cannot transform RTMP url')
2271             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2272             video_url = base + m.group('finalid')
2273
2274             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2275             info = {
2276                 'id': shortMediaId,
2277                 'url': video_url,
2278                 'uploader': showId,
2279                 'upload_date': officialDate,
2280                 'title': effTitle,
2281                 'ext': 'mp4',
2282                 'format': format,
2283                 'thumbnail': None,
2284                 'description': officialTitle,
2285             }
2286             results.append(info)
2287
2288         return results
2289
2290
2291 class EscapistIE(InfoExtractor):
2292     """Information extractor for The Escapist """
2293
2294     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2295     IE_NAME = u'escapist'
2296
2297     def _real_extract(self, url):
2298         mobj = re.match(self._VALID_URL, url)
2299         if mobj is None:
2300             raise ExtractorError(u'Invalid URL: %s' % url)
2301         showName = mobj.group('showname')
2302         videoId = mobj.group('episode')
2303
2304         self.report_extraction(showName)
2305         webPage = self._download_webpage(url, showName)
2306
2307         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2308         description = unescapeHTML(descMatch.group(1))
2309         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2310         imgUrl = unescapeHTML(imgMatch.group(1))
2311         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2312         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2313         configUrlMatch = re.search('config=(.*)$', playerUrl)
2314         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2315
2316         configJSON = self._download_webpage(configUrl, showName,
2317                                             u'Downloading configuration',
2318                                             u'unable to download configuration')
2319
2320         # Technically, it's JavaScript, not JSON
2321         configJSON = configJSON.replace("'", '"')
2322
2323         try:
2324             config = json.loads(configJSON)
2325         except (ValueError,) as err:
2326             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2327
2328         playlist = config['playlist']
2329         videoUrl = playlist[1]['url']
2330
2331         info = {
2332             'id': videoId,
2333             'url': videoUrl,
2334             'uploader': showName,
2335             'upload_date': None,
2336             'title': showName,
2337             'ext': 'mp4',
2338             'thumbnail': imgUrl,
2339             'description': description,
2340             'player_url': playerUrl,
2341         }
2342
2343         return [info]
2344
2345 class CollegeHumorIE(InfoExtractor):
2346     """Information extractor for collegehumor.com"""
2347
2348     _WORKING = False
2349     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2350     IE_NAME = u'collegehumor'
2351
2352     def report_manifest(self, video_id):
2353         """Report information extraction."""
2354         self.to_screen(u'%s: Downloading XML manifest' % video_id)
2355
2356     def _real_extract(self, url):
2357         mobj = re.match(self._VALID_URL, url)
2358         if mobj is None:
2359             raise ExtractorError(u'Invalid URL: %s' % url)
2360         video_id = mobj.group('videoid')
2361
2362         info = {
2363             'id': video_id,
2364             'uploader': None,
2365             'upload_date': None,
2366         }
2367
2368         self.report_extraction(video_id)
2369         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2370         try:
2371             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2372         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2373             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2374
2375         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2376         try:
2377             videoNode = mdoc.findall('./video')[0]
2378             info['description'] = videoNode.findall('./description')[0].text
2379             info['title'] = videoNode.findall('./caption')[0].text
2380             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2381             manifest_url = videoNode.findall('./file')[0].text
2382         except IndexError:
2383             raise ExtractorError(u'Invalid metadata XML file')
2384
2385         manifest_url += '?hdcore=2.10.3'
2386         self.report_manifest(video_id)
2387         try:
2388             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2389         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2390             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2391
2392         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2393         try:
2394             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2395             node_id = media_node.attrib['url']
2396             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2397         except IndexError as err:
2398             raise ExtractorError(u'Invalid manifest file')
2399
2400         url_pr = compat_urllib_parse_urlparse(manifest_url)
2401         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2402
2403         info['url'] = url
2404         info['ext'] = 'f4f'
2405         return [info]
2406
2407
2408 class XVideosIE(InfoExtractor):
2409     """Information extractor for xvideos.com"""
2410
2411     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2412     IE_NAME = u'xvideos'
2413
2414     def _real_extract(self, url):
2415         mobj = re.match(self._VALID_URL, url)
2416         if mobj is None:
2417             raise ExtractorError(u'Invalid URL: %s' % url)
2418         video_id = mobj.group(1)
2419
2420         webpage = self._download_webpage(url, video_id)
2421
2422         self.report_extraction(video_id)
2423
2424
2425         # Extract video URL
2426         mobj = re.search(r'flv_url=(.+?)&', webpage)
2427         if mobj is None:
2428             raise ExtractorError(u'Unable to extract video url')
2429         video_url = compat_urllib_parse.unquote(mobj.group(1))
2430
2431
2432         # Extract title
2433         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2434         if mobj is None:
2435             raise ExtractorError(u'Unable to extract video title')
2436         video_title = mobj.group(1)
2437
2438
2439         # Extract video thumbnail
2440         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2441         if mobj is None:
2442             raise ExtractorError(u'Unable to extract video thumbnail')
2443         video_thumbnail = mobj.group(0)
2444
2445         info = {
2446             'id': video_id,
2447             'url': video_url,
2448             'uploader': None,
2449             'upload_date': None,
2450             'title': video_title,
2451             'ext': 'flv',
2452             'thumbnail': video_thumbnail,
2453             'description': None,
2454         }
2455
2456         return [info]
2457
2458
2459 class SoundcloudIE(InfoExtractor):
2460     """Information extractor for soundcloud.com
2461        To access the media, the uid of the song and a stream token
2462        must be extracted from the page source and the script must make
2463        a request to media.soundcloud.com/crossdomain.xml. Then
2464        the media can be grabbed by requesting from an url composed
2465        of the stream token and uid
2466      """
2467
2468     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2469     IE_NAME = u'soundcloud'
2470
2471     def report_resolve(self, video_id):
2472         """Report information extraction."""
2473         self.to_screen(u'%s: Resolving id' % video_id)
2474
2475     def _real_extract(self, url):
2476         mobj = re.match(self._VALID_URL, url)
2477         if mobj is None:
2478             raise ExtractorError(u'Invalid URL: %s' % url)
2479
2480         # extract uploader (which is in the url)
2481         uploader = mobj.group(1)
2482         # extract simple title (uploader + slug of song title)
2483         slug_title =  mobj.group(2)
2484         simple_title = uploader + u'-' + slug_title
2485         full_title = '%s/%s' % (uploader, slug_title)
2486
2487         self.report_resolve(full_title)
2488
2489         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2490         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2491         info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2492
2493         info = json.loads(info_json)
2494         video_id = info['id']
2495         self.report_extraction(full_title)
2496
2497         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2498         stream_json = self._download_webpage(streams_url, full_title,
2499                                              u'Downloading stream definitions',
2500                                              u'unable to download stream definitions')
2501
2502         streams = json.loads(stream_json)
2503         mediaURL = streams['http_mp3_128_url']
2504         upload_date = unified_strdate(info['created_at'])
2505
2506         return [{
2507             'id':       info['id'],
2508             'url':      mediaURL,
2509             'uploader': info['user']['username'],
2510             'upload_date': upload_date,
2511             'title':    info['title'],
2512             'ext':      u'mp3',
2513             'description': info['description'],
2514         }]
2515
2516 class SoundcloudSetIE(InfoExtractor):
2517     """Information extractor for soundcloud.com sets
2518        To access the media, the uid of the song and a stream token
2519        must be extracted from the page source and the script must make
2520        a request to media.soundcloud.com/crossdomain.xml. Then
2521        the media can be grabbed by requesting from an url composed
2522        of the stream token and uid
2523      """
2524
2525     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2526     IE_NAME = u'soundcloud:set'
2527
2528     def report_resolve(self, video_id):
2529         """Report information extraction."""
2530         self.to_screen(u'%s: Resolving id' % video_id)
2531
2532     def _real_extract(self, url):
2533         mobj = re.match(self._VALID_URL, url)
2534         if mobj is None:
2535             raise ExtractorError(u'Invalid URL: %s' % url)
2536
2537         # extract uploader (which is in the url)
2538         uploader = mobj.group(1)
2539         # extract simple title (uploader + slug of song title)
2540         slug_title =  mobj.group(2)
2541         simple_title = uploader + u'-' + slug_title
2542         full_title = '%s/sets/%s' % (uploader, slug_title)
2543
2544         self.report_resolve(full_title)
2545
2546         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2547         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2548         info_json = self._download_webpage(resolv_url, full_title)
2549
2550         videos = []
2551         info = json.loads(info_json)
2552         if 'errors' in info:
2553             for err in info['errors']:
2554                 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2555             return
2556
2557         self.report_extraction(full_title)
2558         for track in info['tracks']:
2559             video_id = track['id']
2560
2561             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2562             stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2563
2564             self.report_extraction(video_id)
2565             streams = json.loads(stream_json)
2566             mediaURL = streams['http_mp3_128_url']
2567
2568             videos.append({
2569                 'id':       video_id,
2570                 'url':      mediaURL,
2571                 'uploader': track['user']['username'],
2572                 'upload_date':  unified_strdate(track['created_at']),
2573                 'title':    track['title'],
2574                 'ext':      u'mp3',
2575                 'description': track['description'],
2576             })
2577         return videos
2578
2579
2580 class InfoQIE(InfoExtractor):
2581     """Information extractor for infoq.com"""
2582     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2583
2584     def _real_extract(self, url):
2585         mobj = re.match(self._VALID_URL, url)
2586         if mobj is None:
2587             raise ExtractorError(u'Invalid URL: %s' % url)
2588
2589         webpage = self._download_webpage(url, video_id=url)
2590         self.report_extraction(url)
2591
2592         # Extract video URL
2593         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2594         if mobj is None:
2595             raise ExtractorError(u'Unable to extract video url')
2596         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2597         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2598
2599         # Extract title
2600         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2601         if mobj is None:
2602             raise ExtractorError(u'Unable to extract video title')
2603         video_title = mobj.group(1)
2604
2605         # Extract description
2606         video_description = u'No description available.'
2607         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2608         if mobj is not None:
2609             video_description = mobj.group(1)
2610
2611         video_filename = video_url.split('/')[-1]
2612         video_id, extension = video_filename.split('.')
2613
2614         info = {
2615             'id': video_id,
2616             'url': video_url,
2617             'uploader': None,
2618             'upload_date': None,
2619             'title': video_title,
2620             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2621             'thumbnail': None,
2622             'description': video_description,
2623         }
2624
2625         return [info]
2626
2627 class MixcloudIE(InfoExtractor):
2628     """Information extractor for www.mixcloud.com"""
2629
2630     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2631     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2632     IE_NAME = u'mixcloud'
2633
2634     def report_download_json(self, file_id):
2635         """Report JSON download."""
2636         self.to_screen(u'Downloading json')
2637
2638     def get_urls(self, jsonData, fmt, bitrate='best'):
2639         """Get urls from 'audio_formats' section in json"""
2640         file_url = None
2641         try:
2642             bitrate_list = jsonData[fmt]
2643             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2644                 bitrate = max(bitrate_list) # select highest
2645
2646             url_list = jsonData[fmt][bitrate]
2647         except TypeError: # we have no bitrate info.
2648             url_list = jsonData[fmt]
2649         return url_list
2650
2651     def check_urls(self, url_list):
2652         """Returns 1st active url from list"""
2653         for url in url_list:
2654             try:
2655                 compat_urllib_request.urlopen(url)
2656                 return url
2657             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2658                 url = None
2659
2660         return None
2661
2662     def _print_formats(self, formats):
2663         print('Available formats:')
2664         for fmt in formats.keys():
2665             for b in formats[fmt]:
2666                 try:
2667                     ext = formats[fmt][b][0]
2668                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2669                 except TypeError: # we have no bitrate info
2670                     ext = formats[fmt][0]
2671                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2672                     break
2673
2674     def _real_extract(self, url):
2675         mobj = re.match(self._VALID_URL, url)
2676         if mobj is None:
2677             raise ExtractorError(u'Invalid URL: %s' % url)
2678         # extract uploader & filename from url
2679         uploader = mobj.group(1).decode('utf-8')
2680         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2681
2682         # construct API request
2683         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2684         # retrieve .json file with links to files
2685         request = compat_urllib_request.Request(file_url)
2686         try:
2687             self.report_download_json(file_url)
2688             jsonData = compat_urllib_request.urlopen(request).read()
2689         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2690             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2691
2692         # parse JSON
2693         json_data = json.loads(jsonData)
2694         player_url = json_data['player_swf_url']
2695         formats = dict(json_data['audio_formats'])
2696
2697         req_format = self._downloader.params.get('format', None)
2698         bitrate = None
2699
2700         if self._downloader.params.get('listformats', None):
2701             self._print_formats(formats)
2702             return
2703
2704         if req_format is None or req_format == 'best':
2705             for format_param in formats.keys():
2706                 url_list = self.get_urls(formats, format_param)
2707                 # check urls
2708                 file_url = self.check_urls(url_list)
2709                 if file_url is not None:
2710                     break # got it!
2711         else:
2712             if req_format not in formats:
2713                 raise ExtractorError(u'Format is not available')
2714
2715             url_list = self.get_urls(formats, req_format)
2716             file_url = self.check_urls(url_list)
2717             format_param = req_format
2718
2719         return [{
2720             'id': file_id.decode('utf-8'),
2721             'url': file_url.decode('utf-8'),
2722             'uploader': uploader.decode('utf-8'),
2723             'upload_date': None,
2724             'title': json_data['name'],
2725             'ext': file_url.split('.')[-1].decode('utf-8'),
2726             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2727             'thumbnail': json_data['thumbnail_url'],
2728             'description': json_data['description'],
2729             'player_url': player_url.decode('utf-8'),
2730         }]
2731
2732 class StanfordOpenClassroomIE(InfoExtractor):
2733     """Information extractor for Stanford's Open ClassRoom"""
2734
2735     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2736     IE_NAME = u'stanfordoc'
2737
2738     def _real_extract(self, url):
2739         mobj = re.match(self._VALID_URL, url)
2740         if mobj is None:
2741             raise ExtractorError(u'Invalid URL: %s' % url)
2742
2743         if mobj.group('course') and mobj.group('video'): # A specific video
2744             course = mobj.group('course')
2745             video = mobj.group('video')
2746             info = {
2747                 'id': course + '_' + video,
2748                 'uploader': None,
2749                 'upload_date': None,
2750             }
2751
2752             self.report_extraction(info['id'])
2753             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2754             xmlUrl = baseUrl + video + '.xml'
2755             try:
2756                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2757             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2758                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2759             mdoc = xml.etree.ElementTree.fromstring(metaXml)
2760             try:
2761                 info['title'] = mdoc.findall('./title')[0].text
2762                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2763             except IndexError:
2764                 raise ExtractorError(u'Invalid metadata XML file')
2765             info['ext'] = info['url'].rpartition('.')[2]
2766             return [info]
2767         elif mobj.group('course'): # A course page
2768             course = mobj.group('course')
2769             info = {
2770                 'id': course,
2771                 'type': 'playlist',
2772                 'uploader': None,
2773                 'upload_date': None,
2774             }
2775
2776             coursepage = self._download_webpage(url, info['id'],
2777                                         note='Downloading course info page',
2778                                         errnote='Unable to download course info page')
2779
2780             m = re.search('<h1>([^<]+)</h1>', coursepage)
2781             if m:
2782                 info['title'] = unescapeHTML(m.group(1))
2783             else:
2784                 info['title'] = info['id']
2785
2786             m = re.search('<description>([^<]+)</description>', coursepage)
2787             if m:
2788                 info['description'] = unescapeHTML(m.group(1))
2789
2790             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2791             info['list'] = [
2792                 {
2793                     'type': 'reference',
2794                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2795                 }
2796                     for vpage in links]
2797             results = []
2798             for entry in info['list']:
2799                 assert entry['type'] == 'reference'
2800                 results += self.extract(entry['url'])
2801             return results
2802         else: # Root page
2803             info = {
2804                 'id': 'Stanford OpenClassroom',
2805                 'type': 'playlist',
2806                 'uploader': None,
2807                 'upload_date': None,
2808             }
2809
2810             self.report_download_webpage(info['id'])
2811             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2812             try:
2813                 rootpage = compat_urllib_request.urlopen(rootURL).read()
2814             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2815                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2816
2817             info['title'] = info['id']
2818
2819             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2820             info['list'] = [
2821                 {
2822                     'type': 'reference',
2823                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2824                 }
2825                     for cpage in links]
2826
2827             results = []
2828             for entry in info['list']:
2829                 assert entry['type'] == 'reference'
2830                 results += self.extract(entry['url'])
2831             return results
2832
2833 class MTVIE(InfoExtractor):
2834     """Information extractor for MTV.com"""
2835
2836     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2837     IE_NAME = u'mtv'
2838
2839     def _real_extract(self, url):
2840         mobj = re.match(self._VALID_URL, url)
2841         if mobj is None:
2842             raise ExtractorError(u'Invalid URL: %s' % url)
2843         if not mobj.group('proto'):
2844             url = 'http://' + url
2845         video_id = mobj.group('videoid')
2846
2847         webpage = self._download_webpage(url, video_id)
2848
2849         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2850         if mobj is None:
2851             raise ExtractorError(u'Unable to extract song name')
2852         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2853         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2854         if mobj is None:
2855             raise ExtractorError(u'Unable to extract performer')
2856         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2857         video_title = performer + ' - ' + song_name
2858
2859         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2860         if mobj is None:
2861             raise ExtractorError(u'Unable to mtvn_uri')
2862         mtvn_uri = mobj.group(1)
2863
2864         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2865         if mobj is None:
2866             raise ExtractorError(u'Unable to extract content id')
2867         content_id = mobj.group(1)
2868
2869         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2870         self.report_extraction(video_id)
2871         request = compat_urllib_request.Request(videogen_url)
2872         try:
2873             metadataXml = compat_urllib_request.urlopen(request).read()
2874         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2875             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2876
2877         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2878         renditions = mdoc.findall('.//rendition')
2879
2880         # For now, always pick the highest quality.
2881         rendition = renditions[-1]
2882
2883         try:
2884             _,_,ext = rendition.attrib['type'].partition('/')
2885             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2886             video_url = rendition.find('./src').text
2887         except KeyError:
2888             raise ExtractorError('Invalid rendition field.')
2889
2890         info = {
2891             'id': video_id,
2892             'url': video_url,
2893             'uploader': performer,
2894             'upload_date': None,
2895             'title': video_title,
2896             'ext': ext,
2897             'format': format,
2898         }
2899
2900         return [info]
2901
2902
2903 class YoukuIE(InfoExtractor):
2904     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2905
2906     def _gen_sid(self):
2907         nowTime = int(time.time() * 1000)
2908         random1 = random.randint(1000,1998)
2909         random2 = random.randint(1000,9999)
2910
2911         return "%d%d%d" %(nowTime,random1,random2)
2912
2913     def _get_file_ID_mix_string(self, seed):
2914         mixed = []
2915         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2916         seed = float(seed)
2917         for i in range(len(source)):
2918             seed  =  (seed * 211 + 30031 ) % 65536
2919             index  =  math.floor(seed / 65536 * len(source) )
2920             mixed.append(source[int(index)])
2921             source.remove(source[int(index)])
2922         #return ''.join(mixed)
2923         return mixed
2924
2925     def _get_file_id(self, fileId, seed):
2926         mixed = self._get_file_ID_mix_string(seed)
2927         ids = fileId.split('*')
2928         realId = []
2929         for ch in ids:
2930             if ch:
2931                 realId.append(mixed[int(ch)])
2932         return ''.join(realId)
2933
2934     def _real_extract(self, url):
2935         mobj = re.match(self._VALID_URL, url)
2936         if mobj is None:
2937             raise ExtractorError(u'Invalid URL: %s' % url)
2938         video_id = mobj.group('ID')
2939
2940         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2941
2942         jsondata = self._download_webpage(info_url, video_id)
2943
2944         self.report_extraction(video_id)
2945         try:
2946             config = json.loads(jsondata)
2947
2948             video_title =  config['data'][0]['title']
2949             seed = config['data'][0]['seed']
2950
2951             format = self._downloader.params.get('format', None)
2952             supported_format = list(config['data'][0]['streamfileids'].keys())
2953
2954             if format is None or format == 'best':
2955                 if 'hd2' in supported_format:
2956                     format = 'hd2'
2957                 else:
2958                     format = 'flv'
2959                 ext = u'flv'
2960             elif format == 'worst':
2961                 format = 'mp4'
2962                 ext = u'mp4'
2963             else:
2964                 format = 'flv'
2965                 ext = u'flv'
2966
2967
2968             fileid = config['data'][0]['streamfileids'][format]
2969             keys = [s['k'] for s in config['data'][0]['segs'][format]]
2970         except (UnicodeDecodeError, ValueError, KeyError):
2971             raise ExtractorError(u'Unable to extract info section')
2972
2973         files_info=[]
2974         sid = self._gen_sid()
2975         fileid = self._get_file_id(fileid, seed)
2976
2977         #column 8,9 of fileid represent the segment number
2978         #fileid[7:9] should be changed
2979         for index, key in enumerate(keys):
2980
2981             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
2982             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
2983
2984             info = {
2985                 'id': '%s_part%02d' % (video_id, index),
2986                 'url': download_url,
2987                 'uploader': None,
2988                 'upload_date': None,
2989                 'title': video_title,
2990                 'ext': ext,
2991             }
2992             files_info.append(info)
2993
2994         return files_info
2995
2996
2997 class XNXXIE(InfoExtractor):
2998     """Information extractor for xnxx.com"""
2999
3000     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3001     IE_NAME = u'xnxx'
3002     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3003     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3004     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3005
3006     def _real_extract(self, url):
3007         mobj = re.match(self._VALID_URL, url)
3008         if mobj is None:
3009             raise ExtractorError(u'Invalid URL: %s' % url)
3010         video_id = mobj.group(1)
3011
3012         # Get webpage content
3013         webpage = self._download_webpage(url, video_id)
3014
3015         result = re.search(self.VIDEO_URL_RE, webpage)
3016         if result is None:
3017             raise ExtractorError(u'Unable to extract video url')
3018         video_url = compat_urllib_parse.unquote(result.group(1))
3019
3020         result = re.search(self.VIDEO_TITLE_RE, webpage)
3021         if result is None:
3022             raise ExtractorError(u'Unable to extract video title')
3023         video_title = result.group(1)
3024
3025         result = re.search(self.VIDEO_THUMB_RE, webpage)
3026         if result is None:
3027             raise ExtractorError(u'Unable to extract video thumbnail')
3028         video_thumbnail = result.group(1)
3029
3030         return [{
3031             'id': video_id,
3032             'url': video_url,
3033             'uploader': None,
3034             'upload_date': None,
3035             'title': video_title,
3036             'ext': 'flv',
3037             'thumbnail': video_thumbnail,
3038             'description': None,
3039         }]
3040
3041
3042 class GooglePlusIE(InfoExtractor):
3043     """Information extractor for plus.google.com."""
3044
3045     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3046     IE_NAME = u'plus.google'
3047
3048     def report_extract_entry(self, url):
3049         """Report downloading extry"""
3050         self.to_screen(u'Downloading entry: %s' % url)
3051
3052     def report_date(self, upload_date):
3053         """Report downloading extry"""
3054         self.to_screen(u'Entry date: %s' % upload_date)
3055
3056     def report_uploader(self, uploader):
3057         """Report downloading extry"""
3058         self.to_screen(u'Uploader: %s' % uploader)
3059
3060     def report_title(self, video_title):
3061         """Report downloading extry"""
3062         self.to_screen(u'Title: %s' % video_title)
3063
3064     def report_extract_vid_page(self, video_page):
3065         """Report information extraction."""
3066         self.to_screen(u'Extracting video page: %s' % video_page)
3067
3068     def _real_extract(self, url):
3069         # Extract id from URL
3070         mobj = re.match(self._VALID_URL, url)
3071         if mobj is None:
3072             raise ExtractorError(u'Invalid URL: %s' % url)
3073
3074         post_url = mobj.group(0)
3075         video_id = mobj.group(1)
3076
3077         video_extension = 'flv'
3078
3079         # Step 1, Retrieve post webpage to extract further information
3080         self.report_extract_entry(post_url)
3081         webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3082
3083         # Extract update date
3084         upload_date = None
3085         pattern = 'title="Timestamp">(.*?)</a>'
3086         mobj = re.search(pattern, webpage)
3087         if mobj:
3088             upload_date = mobj.group(1)
3089             # Convert timestring to a format suitable for filename
3090             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3091             upload_date = upload_date.strftime('%Y%m%d')
3092         self.report_date(upload_date)
3093
3094         # Extract uploader
3095         uploader = None
3096         pattern = r'rel\="author".*?>(.*?)</a>'
3097         mobj = re.search(pattern, webpage)
3098         if mobj:
3099             uploader = mobj.group(1)
3100         self.report_uploader(uploader)
3101
3102         # Extract title
3103         # Get the first line for title
3104         video_title = u'NA'
3105         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3106         mobj = re.search(pattern, webpage)
3107         if mobj:
3108             video_title = mobj.group(1)
3109         self.report_title(video_title)
3110
3111         # Step 2, Stimulate clicking the image box to launch video
3112         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3113         mobj = re.search(pattern, webpage)
3114         if mobj is None:
3115             self._downloader.report_error(u'unable to extract video page URL')
3116
3117         video_page = mobj.group(1)
3118         webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3119         self.report_extract_vid_page(video_page)
3120
3121
3122         # Extract video links on video page
3123         """Extract video links of all sizes"""
3124         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3125         mobj = re.findall(pattern, webpage)
3126         if len(mobj) == 0:
3127             self._downloader.report_error(u'unable to extract video links')
3128
3129         # Sort in resolution
3130         links = sorted(mobj)
3131
3132         # Choose the lowest of the sort, i.e. highest resolution
3133         video_url = links[-1]
3134         # Only get the url. The resolution part in the tuple has no use anymore
3135         video_url = video_url[-1]
3136         # Treat escaped \u0026 style hex
3137         try:
3138             video_url = video_url.decode("unicode_escape")
3139         except AttributeError: # Python 3
3140             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3141
3142
3143         return [{
3144             'id':       video_id,
3145             'url':      video_url,
3146             'uploader': uploader,
3147             'upload_date':  upload_date,
3148             'title':    video_title,
3149             'ext':      video_extension,
3150         }]
3151
3152 class NBAIE(InfoExtractor):
3153     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3154     IE_NAME = u'nba'
3155
3156     def _real_extract(self, url):
3157         mobj = re.match(self._VALID_URL, url)
3158         if mobj is None:
3159             raise ExtractorError(u'Invalid URL: %s' % url)
3160
3161         video_id = mobj.group(1)
3162         if video_id.endswith('/index.html'):
3163             video_id = video_id[:-len('/index.html')]
3164
3165         webpage = self._download_webpage(url, video_id)
3166
3167         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3168         def _findProp(rexp, default=None):
3169             m = re.search(rexp, webpage)
3170             if m:
3171                 return unescapeHTML(m.group(1))
3172             else:
3173                 return default
3174
3175         shortened_video_id = video_id.rpartition('/')[2]
3176         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3177         info = {
3178             'id': shortened_video_id,
3179             'url': video_url,
3180             'ext': 'mp4',
3181             'title': title,
3182             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3183             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3184         }
3185         return [info]
3186
3187 class JustinTVIE(InfoExtractor):
3188     """Information extractor for justin.tv and twitch.tv"""
3189     # TODO: One broadcast may be split into multiple videos. The key
3190     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3191     # starts at 1 and increases. Can we treat all parts as one video?
3192
3193     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3194         (?:
3195             (?P<channelid>[^/]+)|
3196             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3197             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3198         )
3199         /?(?:\#.*)?$
3200         """
3201     _JUSTIN_PAGE_LIMIT = 100
3202     IE_NAME = u'justin.tv'
3203
3204     def report_download_page(self, channel, offset):
3205         """Report attempt to download a single page of videos."""
3206         self.to_screen(u'%s: Downloading video information from %d to %d' %
3207                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3208
3209     # Return count of items, list of *valid* items
3210     def _parse_page(self, url, video_id):
3211         webpage = self._download_webpage(url, video_id,
3212                                          u'Downloading video info JSON',
3213                                          u'unable to download video info JSON')
3214
3215         response = json.loads(webpage)
3216         if type(response) != list:
3217             error_text = response.get('error', 'unknown error')
3218             raise ExtractorError(u'Justin.tv API: %s' % error_text)
3219         info = []
3220         for clip in response:
3221             video_url = clip['video_file_url']
3222             if video_url:
3223                 video_extension = os.path.splitext(video_url)[1][1:]
3224                 video_date = re.sub('-', '', clip['start_time'][:10])
3225                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3226                 video_id = clip['id']
3227                 video_title = clip.get('title', video_id)
3228                 info.append({
3229                     'id': video_id,
3230                     'url': video_url,
3231                     'title': video_title,
3232                     'uploader': clip.get('channel_name', video_uploader_id),
3233                     'uploader_id': video_uploader_id,
3234                     'upload_date': video_date,
3235                     'ext': video_extension,
3236                 })
3237         return (len(response), info)
3238
3239     def _real_extract(self, url):
3240         mobj = re.match(self._VALID_URL, url)
3241         if mobj is None:
3242             raise ExtractorError(u'invalid URL: %s' % url)
3243
3244         api_base = 'http://api.justin.tv'
3245         paged = False
3246         if mobj.group('channelid'):
3247             paged = True
3248             video_id = mobj.group('channelid')
3249             api = api_base + '/channel/archives/%s.json' % video_id
3250         elif mobj.group('chapterid'):
3251             chapter_id = mobj.group('chapterid')
3252
3253             webpage = self._download_webpage(url, chapter_id)
3254             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3255             if not m:
3256                 raise ExtractorError(u'Cannot find archive of a chapter')
3257             archive_id = m.group(1)
3258
3259             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3260             chapter_info_xml = self._download_webpage(api, chapter_id,
3261                                              note=u'Downloading chapter information',
3262                                              errnote=u'Chapter information download failed')
3263             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3264             for a in doc.findall('.//archive'):
3265                 if archive_id == a.find('./id').text:
3266                     break
3267             else:
3268                 raise ExtractorError(u'Could not find chapter in chapter information')
3269
3270             video_url = a.find('./video_file_url').text
3271             video_ext = video_url.rpartition('.')[2] or u'flv'
3272
3273             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3274             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3275                                    note='Downloading chapter metadata',
3276                                    errnote='Download of chapter metadata failed')
3277             chapter_info = json.loads(chapter_info_json)
3278
3279             bracket_start = int(doc.find('.//bracket_start').text)
3280             bracket_end = int(doc.find('.//bracket_end').text)
3281
3282             # TODO determine start (and probably fix up file)
3283             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3284             #video_url += u'?start=' + TODO:start_timestamp
3285             # bracket_start is 13290, but we want 51670615
3286             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3287                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3288
3289             info = {
3290                 'id': u'c' + chapter_id,
3291                 'url': video_url,
3292                 'ext': video_ext,
3293                 'title': chapter_info['title'],
3294                 'thumbnail': chapter_info['preview'],
3295                 'description': chapter_info['description'],
3296                 'uploader': chapter_info['channel']['display_name'],
3297                 'uploader_id': chapter_info['channel']['name'],
3298             }
3299             return [info]
3300         else:
3301             video_id = mobj.group('videoid')
3302             api = api_base + '/broadcast/by_archive/%s.json' % video_id
3303
3304         self.report_extraction(video_id)
3305
3306         info = []
3307         offset = 0
3308         limit = self._JUSTIN_PAGE_LIMIT
3309         while True:
3310             if paged:
3311                 self.report_download_page(video_id, offset)
3312             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3313             page_count, page_info = self._parse_page(page_url, video_id)
3314             info.extend(page_info)
3315             if not paged or page_count != limit:
3316                 break
3317             offset += limit
3318         return info
3319
3320 class FunnyOrDieIE(InfoExtractor):
3321     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3322
3323     def _real_extract(self, url):
3324         mobj = re.match(self._VALID_URL, url)
3325         if mobj is None:
3326             raise ExtractorError(u'invalid URL: %s' % url)
3327
3328         video_id = mobj.group('id')
3329         webpage = self._download_webpage(url, video_id)
3330
3331         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3332         if not m:
3333             self._downloader.report_error(u'unable to find video information')
3334         video_url = unescapeHTML(m.group('url'))
3335
3336         m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3337         if not m:
3338             m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3339             if not m:
3340                 self._downloader.report_error(u'Cannot find video title')
3341         title = clean_html(m.group('title'))
3342
3343         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3344         if m:
3345             desc = unescapeHTML(m.group('desc'))
3346         else:
3347             desc = None
3348
3349         info = {
3350             'id': video_id,
3351             'url': video_url,
3352             'ext': 'mp4',
3353             'title': title,
3354             'description': desc,
3355         }
3356         return [info]
3357
3358 class SteamIE(InfoExtractor):
3359     _VALID_URL = r"""http://store\.steampowered\.com/
3360                 (agecheck/)?
3361                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3362                 (?P<gameID>\d+)/?
3363                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3364                 """
3365
3366     @classmethod
3367     def suitable(cls, url):
3368         """Receives a URL and returns True if suitable for this IE."""
3369         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3370
3371     def _real_extract(self, url):
3372         m = re.match(self._VALID_URL, url, re.VERBOSE)
3373         gameID = m.group('gameID')
3374         videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3375         self.report_age_confirmation()
3376         webpage = self._download_webpage(videourl, gameID)
3377         game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3378
3379         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3380         mweb = re.finditer(urlRE, webpage)
3381         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3382         titles = re.finditer(namesRE, webpage)
3383         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3384         thumbs = re.finditer(thumbsRE, webpage)
3385         videos = []
3386         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3387             video_id = vid.group('videoID')
3388             title = vtitle.group('videoName')
3389             video_url = vid.group('videoURL')
3390             video_thumb = thumb.group('thumbnail')
3391             if not video_url:
3392                 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3393             info = {
3394                 'id':video_id,
3395                 'url':video_url,
3396                 'ext': 'flv',
3397                 'title': unescapeHTML(title),
3398                 'thumbnail': video_thumb
3399                   }
3400             videos.append(info)
3401         return [self.playlist_result(videos, gameID, game_title)]
3402
3403 class UstreamIE(InfoExtractor):
3404     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3405     IE_NAME = u'ustream'
3406
3407     def _real_extract(self, url):
3408         m = re.match(self._VALID_URL, url)
3409         video_id = m.group('videoID')
3410         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3411         webpage = self._download_webpage(url, video_id)
3412         m = re.search(r'data-title="(?P<title>.+)"',webpage)
3413         title = m.group('title')
3414         m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3415         uploader = m.group('uploader')
3416         info = {
3417                 'id':video_id,
3418                 'url':video_url,
3419                 'ext': 'flv',
3420                 'title': title,
3421                 'uploader': uploader
3422                   }
3423         return [info]
3424
3425 class WorldStarHipHopIE(InfoExtractor):
3426     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3427     IE_NAME = u'WorldStarHipHop'
3428
3429     def _real_extract(self, url):
3430         _src_url = r'so\.addVariable\("file","(.*?)"\)'
3431
3432         m = re.match(self._VALID_URL, url)
3433         video_id = m.group('id')
3434
3435         webpage_src = self._download_webpage(url, video_id)
3436
3437         mobj = re.search(_src_url, webpage_src)
3438
3439         if mobj is not None:
3440             video_url = mobj.group(1)
3441             if 'mp4' in video_url:
3442                 ext = 'mp4'
3443             else:
3444                 ext = 'flv'
3445         else:
3446             raise ExtractorError(u'Cannot find video url for %s' % video_id)
3447
3448         mobj = re.search(r"<title>(.*)</title>", webpage_src)
3449
3450         if mobj is None:
3451             raise ExtractorError(u'Cannot determine title')
3452         title = mobj.group(1)
3453
3454         mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3455         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3456         if mobj is not None:
3457             thumbnail = mobj.group(1)
3458         else:
3459             _title = r"""candytitles.*>(.*)</span>"""
3460             mobj = re.search(_title, webpage_src)
3461             if mobj is not None:
3462                 title = mobj.group(1)
3463             thumbnail = None
3464
3465         results = [{
3466                     'id': video_id,
3467                     'url' : video_url,
3468                     'title' : title,
3469                     'thumbnail' : thumbnail,
3470                     'ext' : ext,
3471                     }]
3472         return results
3473
3474 class RBMARadioIE(InfoExtractor):
3475     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3476
3477     def _real_extract(self, url):
3478         m = re.match(self._VALID_URL, url)
3479         video_id = m.group('videoID')
3480
3481         webpage = self._download_webpage(url, video_id)
3482         m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3483         if not m:
3484             raise ExtractorError(u'Cannot find metadata')
3485         json_data = m.group(1)
3486
3487         try:
3488             data = json.loads(json_data)
3489         except ValueError as e:
3490             raise ExtractorError(u'Invalid JSON: ' + str(e))
3491
3492         video_url = data['akamai_url'] + '&cbr=256'
3493         url_parts = compat_urllib_parse_urlparse(video_url)
3494         video_ext = url_parts.path.rpartition('.')[2]
3495         info = {
3496                 'id': video_id,
3497                 'url': video_url,
3498                 'ext': video_ext,
3499                 'title': data['title'],
3500                 'description': data.get('teaser_text'),
3501                 'location': data.get('country_of_origin'),
3502                 'uploader': data.get('host', {}).get('name'),
3503                 'uploader_id': data.get('host', {}).get('slug'),
3504                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3505                 'duration': data.get('duration'),
3506         }
3507         return [info]
3508
3509
3510 class YouPornIE(InfoExtractor):
3511     """Information extractor for youporn.com."""
3512     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3513
3514     def _print_formats(self, formats):
3515         """Print all available formats"""
3516         print(u'Available formats:')
3517         print(u'ext\t\tformat')
3518         print(u'---------------------------------')
3519         for format in formats:
3520             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3521
3522     def _specific(self, req_format, formats):
3523         for x in formats:
3524             if(x["format"]==req_format):
3525                 return x
3526         return None
3527
3528     def _real_extract(self, url):
3529         mobj = re.match(self._VALID_URL, url)
3530         if mobj is None:
3531             raise ExtractorError(u'Invalid URL: %s' % url)
3532
3533         video_id = mobj.group('videoid')
3534
3535         req = compat_urllib_request.Request(url)
3536         req.add_header('Cookie', 'age_verified=1')
3537         webpage = self._download_webpage(req, video_id)
3538
3539         # Get the video title
3540         result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3541         if result is None:
3542             raise ExtractorError(u'Unable to extract video title')
3543         video_title = result.group('title').strip()
3544
3545         # Get the video date
3546         result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3547         if result is None:
3548             self._downloader.report_warning(u'unable to extract video date')
3549             upload_date = None
3550         else:
3551             upload_date = unified_strdate(result.group('date').strip())
3552
3553         # Get the video uploader
3554         result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3555         if result is None:
3556             self._downloader.report_warning(u'unable to extract uploader')
3557             video_uploader = None
3558         else:
3559             video_uploader = result.group('uploader').strip()
3560             video_uploader = clean_html( video_uploader )
3561
3562         # Get all of the formats available
3563         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3564         result = re.search(DOWNLOAD_LIST_RE, webpage)
3565         if result is None:
3566             raise ExtractorError(u'Unable to extract download list')
3567         download_list_html = result.group('download_list').strip()
3568
3569         # Get all of the links from the page
3570         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3571         links = re.findall(LINK_RE, download_list_html)
3572         if(len(links) == 0):
3573             raise ExtractorError(u'ERROR: no known formats available for video')
3574
3575         self.to_screen(u'Links found: %d' % len(links))
3576
3577         formats = []
3578         for link in links:
3579
3580             # A link looks like this:
3581             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3582             # A path looks like this:
3583             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3584             video_url = unescapeHTML( link )
3585             path = compat_urllib_parse_urlparse( video_url ).path
3586             extension = os.path.splitext( path )[1][1:]
3587             format = path.split('/')[4].split('_')[:2]
3588             size = format[0]
3589             bitrate = format[1]
3590             format = "-".join( format )
3591             title = u'%s-%s-%s' % (video_title, size, bitrate)
3592
3593             formats.append({
3594                 'id': video_id,
3595                 'url': video_url,
3596                 'uploader': video_uploader,
3597                 'upload_date': upload_date,
3598                 'title': title,
3599                 'ext': extension,
3600                 'format': format,
3601                 'thumbnail': None,
3602                 'description': None,
3603                 'player_url': None
3604             })
3605
3606         if self._downloader.params.get('listformats', None):
3607             self._print_formats(formats)
3608             return
3609
3610         req_format = self._downloader.params.get('format', None)
3611         self.to_screen(u'Format: %s' % req_format)
3612
3613         if req_format is None or req_format == 'best':
3614             return [formats[0]]
3615         elif req_format == 'worst':
3616             return [formats[-1]]
3617         elif req_format in ('-1', 'all'):
3618             return formats
3619         else:
3620             format = self._specific( req_format, formats )
3621             if result is None:
3622                 raise ExtractorError(u'Requested format not available')
3623             return [format]
3624
3625
3626
3627 class PornotubeIE(InfoExtractor):
3628     """Information extractor for pornotube.com."""
3629     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3630
3631     def _real_extract(self, url):
3632         mobj = re.match(self._VALID_URL, url)
3633         if mobj is None:
3634             raise ExtractorError(u'Invalid URL: %s' % url)
3635
3636         video_id = mobj.group('videoid')
3637         video_title = mobj.group('title')
3638
3639         # Get webpage content
3640         webpage = self._download_webpage(url, video_id)
3641
3642         # Get the video URL
3643         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3644         result = re.search(VIDEO_URL_RE, webpage)
3645         if result is None:
3646             raise ExtractorError(u'Unable to extract video url')
3647         video_url = compat_urllib_parse.unquote(result.group('url'))
3648
3649         #Get the uploaded date
3650         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3651         result = re.search(VIDEO_UPLOADED_RE, webpage)
3652         if result is None:
3653             raise ExtractorError(u'Unable to extract video title')
3654         upload_date = unified_strdate(result.group('date'))
3655
3656         info = {'id': video_id,
3657                 'url': video_url,
3658                 'uploader': None,
3659                 'upload_date': upload_date,
3660                 'title': video_title,
3661                 'ext': 'flv',
3662                 'format': 'flv'}
3663
3664         return [info]
3665
3666 class YouJizzIE(InfoExtractor):
3667     """Information extractor for youjizz.com."""
3668     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3669
3670     def _real_extract(self, url):
3671         mobj = re.match(self._VALID_URL, url)
3672         if mobj is None:
3673             raise ExtractorError(u'Invalid URL: %s' % url)
3674
3675         video_id = mobj.group('videoid')
3676
3677         # Get webpage content
3678         webpage = self._download_webpage(url, video_id)
3679
3680         # Get the video title
3681         result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3682         if result is None:
3683             raise ExtractorError(u'ERROR: unable to extract video title')
3684         video_title = result.group('title').strip()
3685
3686         # Get the embed page
3687         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3688         if result is None:
3689             raise ExtractorError(u'ERROR: unable to extract embed page')
3690
3691         embed_page_url = result.group(0).strip()
3692         video_id = result.group('videoid')
3693
3694         webpage = self._download_webpage(embed_page_url, video_id)
3695
3696         # Get the video URL
3697         result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3698         if result is None:
3699             raise ExtractorError(u'ERROR: unable to extract video url')
3700         video_url = result.group('source')
3701
3702         info = {'id': video_id,
3703                 'url': video_url,
3704                 'title': video_title,
3705                 'ext': 'flv',
3706                 'format': 'flv',
3707                 'player_url': embed_page_url}
3708
3709         return [info]
3710
3711 class EightTracksIE(InfoExtractor):
3712     IE_NAME = '8tracks'
3713     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3714
3715     def _real_extract(self, url):
3716         mobj = re.match(self._VALID_URL, url)
3717         if mobj is None:
3718             raise ExtractorError(u'Invalid URL: %s' % url)
3719         playlist_id = mobj.group('id')
3720
3721         webpage = self._download_webpage(url, playlist_id)
3722
3723         m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3724         if not m:
3725             raise ExtractorError(u'Cannot find trax information')
3726         json_like = m.group(1)
3727         data = json.loads(json_like)
3728
3729         session = str(random.randint(0, 1000000000))
3730         mix_id = data['id']
3731         track_count = data['tracks_count']
3732         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3733         next_url = first_url
3734         res = []
3735         for i in itertools.count():
3736             api_json = self._download_webpage(next_url, playlist_id,
3737                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3738                 errnote=u'Failed to download song information')
3739             api_data = json.loads(api_json)
3740             track_data = api_data[u'set']['track']
3741             info = {
3742                 'id': track_data['id'],
3743                 'url': track_data['track_file_stream_url'],
3744                 'title': track_data['performer'] + u' - ' + track_data['name'],
3745                 'raw_title': track_data['name'],
3746                 'uploader_id': data['user']['login'],
3747                 'ext': 'm4a',
3748             }
3749             res.append(info)
3750             if api_data['set']['at_last_track']:
3751                 break
3752             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3753         return res
3754
3755 class KeekIE(InfoExtractor):
3756     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3757     IE_NAME = u'keek'
3758
3759     def _real_extract(self, url):
3760         m = re.match(self._VALID_URL, url)
3761         video_id = m.group('videoID')
3762         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3763         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3764         webpage = self._download_webpage(url, video_id)
3765         m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3766         title = unescapeHTML(m.group('title'))
3767         m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3768         uploader = clean_html(m.group('uploader'))
3769         info = {
3770                 'id': video_id,
3771                 'url': video_url,
3772                 'ext': 'mp4',
3773                 'title': title,
3774                 'thumbnail': thumbnail,
3775                 'uploader': uploader
3776         }
3777         return [info]
3778
3779 class TEDIE(InfoExtractor):
3780     _VALID_URL=r'''http://www\.ted\.com/
3781                    (
3782                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3783                         |
3784                         ((?P<type_talk>talks)) # We have a simple talk
3785                    )
3786                    (/lang/(.*?))? # The url may contain the language
3787                    /(?P<name>\w+) # Here goes the name and then ".html"
3788                    '''
3789
3790     @classmethod
3791     def suitable(cls, url):
3792         """Receives a URL and returns True if suitable for this IE."""
3793         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3794
3795     def _real_extract(self, url):
3796         m=re.match(self._VALID_URL, url, re.VERBOSE)
3797         if m.group('type_talk'):
3798             return [self._talk_info(url)]
3799         else :
3800             playlist_id=m.group('playlist_id')
3801             name=m.group('name')
3802             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3803             return [self._playlist_videos_info(url,name,playlist_id)]
3804
3805     def _talk_video_link(self,mediaSlug):
3806         '''Returns the video link for that mediaSlug'''
3807         return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3808
3809     def _playlist_videos_info(self,url,name,playlist_id=0):
3810         '''Returns the videos of the playlist'''
3811         video_RE=r'''
3812                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3813                      ([.\s]*?)data-playlist_item_id="(\d+)"
3814                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3815                      '''
3816         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3817         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3818         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3819         m_names=re.finditer(video_name_RE,webpage)
3820
3821         playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3822         m_playlist = re.search(playlist_RE, webpage)
3823         playlist_title = m_playlist.group('playlist_title')
3824
3825         playlist_entries = []
3826         for m_video, m_name in zip(m_videos,m_names):
3827             video_id=m_video.group('video_id')
3828             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3829             playlist_entries.append(self.url_result(talk_url, 'TED'))
3830         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3831
3832     def _talk_info(self, url, video_id=0):
3833         """Return the video for the talk in the url"""
3834         m=re.match(self._VALID_URL, url,re.VERBOSE)
3835         videoName=m.group('name')
3836         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3837         # If the url includes the language we get the title translated
3838         title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3839         title=re.search(title_RE, webpage).group('title')
3840         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3841                         "id":(?P<videoID>[\d]+).*?
3842                         "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3843         thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3844         thumb_match=re.search(thumb_RE,webpage)
3845         info_match=re.search(info_RE,webpage,re.VERBOSE)
3846         video_id=info_match.group('videoID')
3847         mediaSlug=info_match.group('mediaSlug')
3848         video_url=self._talk_video_link(mediaSlug)
3849         info = {
3850                 'id': video_id,
3851                 'url': video_url,
3852                 'ext': 'mp4',
3853                 'title': title,
3854                 'thumbnail': thumb_match.group('thumbnail')
3855                 }
3856         return info
3857
3858 class MySpassIE(InfoExtractor):
3859     _VALID_URL = r'http://www.myspass.de/.*'
3860
3861     def _real_extract(self, url):
3862         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3863
3864         # video id is the last path element of the URL
3865         # usually there is a trailing slash, so also try the second but last
3866         url_path = compat_urllib_parse_urlparse(url).path
3867         url_parent_path, video_id = os.path.split(url_path)
3868         if not video_id:
3869             _, video_id = os.path.split(url_parent_path)
3870
3871         # get metadata
3872         metadata_url = META_DATA_URL_TEMPLATE % video_id
3873         metadata_text = self._download_webpage(metadata_url, video_id)
3874         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3875
3876         # extract values from metadata
3877         url_flv_el = metadata.find('url_flv')
3878         if url_flv_el is None:
3879             raise ExtractorError(u'Unable to extract download url')
3880         video_url = url_flv_el.text
3881         extension = os.path.splitext(video_url)[1][1:]
3882         title_el = metadata.find('title')
3883         if title_el is None:
3884             raise ExtractorError(u'Unable to extract title')
3885         title = title_el.text
3886         format_id_el = metadata.find('format_id')
3887         if format_id_el is None:
3888             format = ext
3889         else:
3890             format = format_id_el.text
3891         description_el = metadata.find('description')
3892         if description_el is not None:
3893             description = description_el.text
3894         else:
3895             description = None
3896         imagePreview_el = metadata.find('imagePreview')
3897         if imagePreview_el is not None:
3898             thumbnail = imagePreview_el.text
3899         else:
3900             thumbnail = None
3901         info = {
3902             'id': video_id,
3903             'url': video_url,
3904             'title': title,
3905             'ext': extension,
3906             'format': format,
3907             'thumbnail': thumbnail,
3908             'description': description
3909         }
3910         return [info]
3911
3912 class SpiegelIE(InfoExtractor):
3913     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3914
3915     def _real_extract(self, url):
3916         m = re.match(self._VALID_URL, url)
3917         video_id = m.group('videoID')
3918
3919         webpage = self._download_webpage(url, video_id)
3920         m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
3921         if not m:
3922             raise ExtractorError(u'Cannot find title')
3923         video_title = unescapeHTML(m.group(1))
3924
3925         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3926         xml_code = self._download_webpage(xml_url, video_id,
3927                     note=u'Downloading XML', errnote=u'Failed to download XML')
3928
3929         idoc = xml.etree.ElementTree.fromstring(xml_code)
3930         last_type = idoc[-1]
3931         filename = last_type.findall('./filename')[0].text
3932         duration = float(last_type.findall('./duration')[0].text)
3933
3934         video_url = 'http://video2.spiegel.de/flash/' + filename
3935         video_ext = filename.rpartition('.')[2]
3936         info = {
3937             'id': video_id,
3938             'url': video_url,
3939             'ext': video_ext,
3940             'title': video_title,
3941             'duration': duration,
3942         }
3943         return [info]
3944
3945 class LiveLeakIE(InfoExtractor):
3946
3947     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3948     IE_NAME = u'liveleak'
3949
3950     def _real_extract(self, url):
3951         mobj = re.match(self._VALID_URL, url)
3952         if mobj is None:
3953             raise ExtractorError(u'Invalid URL: %s' % url)
3954
3955         video_id = mobj.group('video_id')
3956
3957         webpage = self._download_webpage(url, video_id)
3958
3959         m = re.search(r'file: "(.*?)",', webpage)
3960         if not m:
3961             raise ExtractorError(u'Unable to find video url')
3962         video_url = m.group(1)
3963
3964         m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3965         if not m:
3966             self._downloader.report_error(u'Cannot find video title')
3967         title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
3968
3969         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3970         if m:
3971             desc = unescapeHTML(m.group('desc'))
3972         else:
3973             desc = None
3974
3975         m = re.search(r'By:.*?(\w+)</a>', webpage)
3976         if m:
3977             uploader = clean_html(m.group(1))
3978         else:
3979             uploader = None
3980
3981         info = {
3982             'id':  video_id,
3983             'url': video_url,
3984             'ext': 'mp4',
3985             'title': title,
3986             'description': desc,
3987             'uploader': uploader
3988         }
3989
3990         return [info]
3991
3992 class ARDIE(InfoExtractor):
3993     _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3994     _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3995     _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3996
3997     def _real_extract(self, url):
3998         # determine video id from url
3999         m = re.match(self._VALID_URL, url)
4000
4001         numid = re.search(r'documentId=([0-9]+)', url)
4002         if numid:
4003             video_id = numid.group(1)
4004         else:
4005             video_id = m.group('video_id')
4006
4007         # determine title and media streams from webpage
4008         html = self._download_webpage(url, video_id)
4009         title = re.search(self._TITLE, html).group('title')
4010         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4011         if not streams:
4012             assert '"fsk"' in html
4013             raise ExtractorError(u'This video is only available after 8:00 pm')
4014
4015         # choose default media type and highest quality for now
4016         stream = max([s for s in streams if int(s["media_type"]) == 0],
4017                      key=lambda s: int(s["quality"]))
4018
4019         # there's two possibilities: RTMP stream or HTTP download
4020         info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4021         if stream['rtmp_url']:
4022             self.to_screen(u'RTMP download detected')
4023             assert stream['video_url'].startswith('mp4:')
4024             info["url"] = stream["rtmp_url"]
4025             info["play_path"] = stream['video_url']
4026         else:
4027             assert stream["video_url"].endswith('.mp4')
4028             info["url"] = stream["video_url"]
4029         return [info]
4030
4031 class TumblrIE(InfoExtractor):
4032     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4033
4034     def _real_extract(self, url):
4035         m_url = re.match(self._VALID_URL, url)
4036         video_id = m_url.group('id')
4037         blog = m_url.group('blog_name')
4038
4039         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4040         webpage = self._download_webpage(url, video_id)
4041
4042         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4043         video = re.search(re_video, webpage)
4044         if video is None:
4045             self.to_screen("No video founded")
4046             return []
4047         video_url = video.group('video_url')
4048         ext = video.group('ext')
4049
4050         re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
4051         thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4052
4053         # The only place where you can get a title, it's not complete,
4054         # but searching in other places doesn't work for all videos
4055         re_title = r'<title>(?P<title>.*?)</title>'
4056         title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
4057
4058         return [{'id': video_id,
4059                  'url': video_url,
4060                  'title': title,
4061                  'thumbnail': thumb,
4062                  'ext': ext
4063                  }]
4064
4065 class BandcampIE(InfoExtractor):
4066     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4067
4068     def _real_extract(self, url):
4069         mobj = re.match(self._VALID_URL, url)
4070         title = mobj.group('title')
4071         webpage = self._download_webpage(url, title)
4072         # We get the link to the free download page
4073         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4074         if m_download is None:
4075             raise ExtractorError(u'No free songs founded')
4076
4077         download_link = m_download.group(1)
4078         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4079                        webpage, re.MULTILINE|re.DOTALL).group('id')
4080
4081         download_webpage = self._download_webpage(download_link, id,
4082                                                   'Downloading free downloads page')
4083         # We get the dictionary of the track from some javascrip code
4084         info = re.search(r'items: (.*?),$',
4085                          download_webpage, re.MULTILINE).group(1)
4086         info = json.loads(info)[0]
4087         # We pick mp3-320 for now, until format selection can be easily implemented.
4088         mp3_info = info[u'downloads'][u'mp3-320']
4089         # If we try to use this url it says the link has expired
4090         initial_url = mp3_info[u'url']
4091         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4092         m_url = re.match(re_url, initial_url)
4093         #We build the url we will use to get the final track url
4094         # This url is build in Bandcamp in the script download_bunde_*.js
4095         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4096         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4097         # If we could correctly generate the .rand field the url would be
4098         #in the "download_url" key
4099         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4100
4101         track_info = {'id':id,
4102                       'title' : info[u'title'],
4103                       'ext' : 'mp3',
4104                       'url' : final_url,
4105                       'thumbnail' : info[u'thumb_url'],
4106                       'uploader' : info[u'artist']
4107                       }
4108
4109         return [track_info]
4110
4111 class RedTubeIE(InfoExtractor):
4112     """Information Extractor for redtube"""
4113     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4114
4115     def _real_extract(self,url):
4116         mobj = re.match(self._VALID_URL, url)
4117         if mobj is None:
4118             raise ExtractorError(u'Invalid URL: %s' % url)
4119
4120         video_id = mobj.group('id')
4121         video_extension = 'mp4'
4122         webpage = self._download_webpage(url, video_id)
4123         self.report_extraction(video_id)
4124         mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
4125
4126         if mobj is None:
4127             raise ExtractorError(u'Unable to extract media URL')
4128
4129         video_url = mobj.group(1)
4130         mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
4131         if mobj is None:
4132             raise ExtractorError(u'Unable to extract title')
4133         video_title = mobj.group(1)
4134
4135         return [{
4136             'id':       video_id,
4137             'url':      video_url,
4138             'ext':      video_extension,
4139             'title':    video_title,
4140         }]
4141
4142
4143 def gen_extractors():
4144     """ Return a list of an instance of every supported extractor.
4145     The order does matter; the first extractor matched is the one handling the URL.
4146     """
4147     return [
4148         YoutubePlaylistIE(),
4149         YoutubeChannelIE(),
4150         YoutubeUserIE(),
4151         YoutubeSearchIE(),
4152         YoutubeIE(),
4153         MetacafeIE(),
4154         DailymotionIE(),
4155         GoogleSearchIE(),
4156         PhotobucketIE(),
4157         YahooIE(),
4158         YahooSearchIE(),
4159         DepositFilesIE(),
4160         FacebookIE(),
4161         BlipTVUserIE(),
4162         BlipTVIE(),
4163         VimeoIE(),
4164         MyVideoIE(),
4165         ComedyCentralIE(),
4166         EscapistIE(),
4167         CollegeHumorIE(),
4168         XVideosIE(),
4169         SoundcloudSetIE(),
4170         SoundcloudIE(),
4171         InfoQIE(),
4172         MixcloudIE(),
4173         StanfordOpenClassroomIE(),
4174         MTVIE(),
4175         YoukuIE(),
4176         XNXXIE(),
4177         YouJizzIE(),
4178         PornotubeIE(),
4179         YouPornIE(),
4180         GooglePlusIE(),
4181         ArteTvIE(),
4182         NBAIE(),
4183         WorldStarHipHopIE(),
4184         JustinTVIE(),
4185         FunnyOrDieIE(),
4186         SteamIE(),
4187         UstreamIE(),
4188         RBMARadioIE(),
4189         EightTracksIE(),
4190         KeekIE(),
4191         TEDIE(),
4192         MySpassIE(),
4193         SpiegelIE(),
4194         LiveLeakIE(),
4195         ARDIE(),
4196         TumblrIE(),
4197         BandcampIE(),
4198         RedTubeIE(),
4199         GenericIE()
4200     ]
4201
4202 def get_info_extractor(ie_name):
4203     """Returns the info extractor class with the given ie_name"""
4204     return globals()[ie_name+'IE']