youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19
  20 from .utils import *
  21
  22
  23 class InfoExtractor(object):
  24     """Information Extractor class.
  25
  26     Information extractors are the classes that, given a URL, extract
  27     information about the video (or videos) the URL refers to. This
  28     information includes the real video URL, the video title, author and
  29     others. The information is stored in a dictionary which is then
  30     passed to the FileDownloader. The FileDownloader processes this
  31     information possibly downloading the video to the file system, among
  32     other possible outcomes.
  33
  34     The dictionaries must include the following fields:
  35
  36     id:             Video identifier.
  37     url:            Final video URL.
  38     title:          Video title, unescaped.
  39     ext:            Video filename extension.
  40
  41     The following fields are optional:
  42
  43     format:         The video format, defaults to ext (used for --get-format)
  44     thumbnail:      Full URL to a video thumbnail image.
  45     description:    One-line video description.
  46     uploader:       Full name of the video uploader.
  47     upload_date:    Video upload date (YYYYMMDD).
  48     uploader_id:    Nickname or id of the video uploader.
  49     location:       Physical location of the video.
  50     player_url:     SWF Player URL (used for rtmpdump).
  51     subtitles:      The subtitle file contents.
  52     urlhandle:      [internal] The urlHandle to be used to download the file,
  53                     like returned by urllib.request.urlopen
  54
  55     The fields should all be Unicode strings.
  56
  57     Subclasses of this one should re-define the _real_initialize() and
  58     _real_extract() methods and define a _VALID_URL regexp.
  59     Probably, they should also be added to the list of extractors.
  60
  61     _real_extract() must return a *list* of information dictionaries as
  62     described above.
  63
  64     Finally, the _WORKING attribute should be set to False for broken IEs
  65     in order to warn the users and skip the tests.
  66     """
  67
  68     _ready = False
  69     _downloader = None
  70     _WORKING = True
  71
  72     def __init__(self, downloader=None):
  73         """Constructor. Receives an optional downloader."""
  74         self._ready = False
  75         self.set_downloader(downloader)
  76
  77     @classmethod
  78     def suitable(cls, url):
  79         """Receives a URL and returns True if suitable for this IE."""
  80         return re.match(cls._VALID_URL, url) is not None
  81
  82     @classmethod
  83     def working(cls):
  84         """Getter method for _WORKING."""
  85         return cls._WORKING
  86
  87     def initialize(self):
  88         """Initializes an instance (authentication, etc)."""
  89         if not self._ready:
  90             self._real_initialize()
  91             self._ready = True
  92
  93     def extract(self, url):
  94         """Extracts URL information and returns it in list of dicts."""
  95         self.initialize()
  96         return self._real_extract(url)
  97
  98     def set_downloader(self, downloader):
  99         """Sets the downloader for this IE."""
 100         self._downloader = downloader
 101
 102     def _real_initialize(self):
 103         """Real initialization process. Redefine in subclasses."""
 104         pass
 105
 106     def _real_extract(self, url):
 107         """Real extraction process. Redefine in subclasses."""
 108         pass
 109
 110     @property
 111     def IE_NAME(self):
 112         return type(self).__name__[:-2]
 113
 114     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 115         """ Returns the response handle """
 116         if note is None:
 117             note = u'Downloading video webpage'
 118         self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
 119         try:
 120             return compat_urllib_request.urlopen(url_or_request)
 121         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 122             if errnote is None:
 123                 errnote = u'Unable to download webpage'
 124             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 125
 126     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 127         """ Returns the data of the page as a string """
 128         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 129         content_type = urlh.headers.get('Content-Type', '')
 130         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 131         if m:
 132             encoding = m.group(1)
 133         else:
 134             encoding = 'utf-8'
 135         webpage_bytes = urlh.read()
 136         return webpage_bytes.decode(encoding, 'replace')
 137
 138     #Methods for following #608
 139     #They set the correct value of the '_type' key
 140     def video_result(self, video_info):
 141         """Returns a video"""
 142         video_info['_type'] = 'video'
 143         return video_info
 144     def url_result(self, url, ie=None):
 145         """Returns a url that points to a page that should be processed"""
 146         #TODO: ie should be the class used for getting the info
 147         video_info = {'_type': 'url',
 148                       'url': url}
 149         return video_info
 150     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 151         """Returns a playlist"""
 152         video_info = {'_type': 'playlist',
 153                       'entries': entries}
 154         if playlist_id:
 155             video_info['id'] = playlist_id
 156         if playlist_title:
 157             video_info['title'] = playlist_title
 158         return video_info
 159
 160
 161 class YoutubeIE(InfoExtractor):
 162     """Information extractor for youtube.com."""
 163
 164     _VALID_URL = r"""^
 165                      (
 166                          (?:https?://)?                                       # http(s):// (optional)
 167                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 168                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 169                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 170                          (?:                                                  # the various things that can precede the ID:
 171                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 172                              |(?:                                             # or the v= param in all its forms
 173                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 174                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 175                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 176                                  v=
 177                              )
 178                          )?                                                   # optional -> youtube.com/xxxx is OK
 179                      )?                                                       # all until now is optional -> you can pass the naked ID
 180                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 181                      (?(1).+)?                                                # if we found the ID, everything can follow
 182                      $"""
 183     _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 184     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 185     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 186     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 187     _NETRC_MACHINE = 'youtube'
 188     # Listed in order of quality
 189     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 190     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 191     _video_extensions = {
 192         '13': '3gp',
 193         '17': 'mp4',
 194         '18': 'mp4',
 195         '22': 'mp4',
 196         '37': 'mp4',
 197         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 198         '43': 'webm',
 199         '44': 'webm',
 200         '45': 'webm',
 201         '46': 'webm',
 202     }
 203     _video_dimensions = {
 204         '5': '240x400',
 205         '6': '???',
 206         '13': '???',
 207         '17': '144x176',
 208         '18': '360x640',
 209         '22': '720x1280',
 210         '34': '360x640',
 211         '35': '480x854',
 212         '37': '1080x1920',
 213         '38': '3072x4096',
 214         '43': '360x640',
 215         '44': '480x854',
 216         '45': '720x1280',
 217         '46': '1080x1920',
 218     }
 219     IE_NAME = u'youtube'
 220
 221     @classmethod
 222     def suitable(cls, url):
 223         """Receives a URL and returns True if suitable for this IE."""
 224         if YoutubePlaylistIE.suitable(url): return False
 225         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 226
 227     def report_lang(self):
 228         """Report attempt to set language."""
 229         self._downloader.to_screen(u'[youtube] Setting language')
 230
 231     def report_login(self):
 232         """Report attempt to log in."""
 233         self._downloader.to_screen(u'[youtube] Logging in')
 234
 235     def report_age_confirmation(self):
 236         """Report attempt to confirm age."""
 237         self._downloader.to_screen(u'[youtube] Confirming age')
 238
 239     def report_video_webpage_download(self, video_id):
 240         """Report attempt to download video webpage."""
 241         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 242
 243     def report_video_info_webpage_download(self, video_id):
 244         """Report attempt to download video info webpage."""
 245         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 246
 247     def report_video_subtitles_download(self, video_id):
 248         """Report attempt to download video info webpage."""
 249         self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
 250
 251     def report_video_subtitles_request(self, video_id, sub_lang, format):
 252         """Report attempt to download video info webpage."""
 253         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 254
 255     def report_video_subtitles_available(self, video_id, sub_lang_list):
 256         """Report available subtitles."""
 257         sub_lang = ",".join(list(sub_lang_list.keys()))
 258         self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
 259
 260     def report_information_extraction(self, video_id):
 261         """Report attempt to extract video information."""
 262         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 263
 264     def report_unavailable_format(self, video_id, format):
 265         """Report extracted video URL."""
 266         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 267
 268     def report_rtmp_download(self):
 269         """Indicate the download will use the RTMP protocol."""
 270         self._downloader.to_screen(u'[youtube] RTMP download detected')
 271
 272     def _get_available_subtitles(self, video_id):
 273         self.report_video_subtitles_download(video_id)
 274         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 275         try:
 276             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 277         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 278             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 279         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 280         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 281         if not sub_lang_list:
 282             return (u'WARNING: video doesn\'t have subtitles', None)
 283         return sub_lang_list
 284
 285     def _list_available_subtitles(self, video_id):
 286         sub_lang_list = self._get_available_subtitles(video_id)
 287         self.report_video_subtitles_available(video_id, sub_lang_list)
 288
 289     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 290         self.report_video_subtitles_request(video_id, sub_lang, format)
 291         params = compat_urllib_parse.urlencode({
 292             'lang': sub_lang,
 293             'name': sub_name,
 294             'v': video_id,
 295             'fmt': format,
 296         })
 297         url = 'http://www.youtube.com/api/timedtext?' + params
 298         try:
 299             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 300         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 301             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 302         if not sub:
 303             return (u'WARNING: Did not fetch video subtitles', None)
 304         return (None, sub_lang, sub)
 305
 306     def _extract_subtitle(self, video_id):
 307         sub_lang_list = self._get_available_subtitles(video_id)
 308         sub_format = self._downloader.params.get('subtitlesformat')
 309         if self._downloader.params.get('subtitleslang', False):
 310             sub_lang = self._downloader.params.get('subtitleslang')
 311         elif 'en' in sub_lang_list:
 312             sub_lang = 'en'
 313         else:
 314             sub_lang = list(sub_lang_list.keys())[0]
 315         if not sub_lang in sub_lang_list:
 316             return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None)
 317
 318         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 319         return [subtitle]
 320
 321     def _extract_all_subtitles(self, video_id):
 322         sub_lang_list = self._get_available_subtitles(video_id)
 323         sub_format = self._downloader.params.get('subtitlesformat')
 324         subtitles = []
 325         for sub_lang in sub_lang_list:
 326             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 327             subtitles.append(subtitle)
 328         return subtitles
 329
 330     def _print_formats(self, formats):
 331         print('Available formats:')
 332         for x in formats:
 333             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 334
 335     def _real_initialize(self):
 336         if self._downloader is None:
 337             return
 338
 339         username = None
 340         password = None
 341         downloader_params = self._downloader.params
 342
 343         # Attempt to use provided username and password or .netrc data
 344         if downloader_params.get('username', None) is not None:
 345             username = downloader_params['username']
 346             password = downloader_params['password']
 347         elif downloader_params.get('usenetrc', False):
 348             try:
 349                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 350                 if info is not None:
 351                     username = info[0]
 352                     password = info[2]
 353                 else:
 354                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 355             except (IOError, netrc.NetrcParseError) as err:
 356                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 357                 return
 358
 359         # Set language
 360         request = compat_urllib_request.Request(self._LANG_URL)
 361         try:
 362             self.report_lang()
 363             compat_urllib_request.urlopen(request).read()
 364         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 365             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 366             return
 367
 368         # No authentication to be performed
 369         if username is None:
 370             return
 371
 372         request = compat_urllib_request.Request(self._LOGIN_URL)
 373         try:
 374             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 375         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 376             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 377             return
 378
 379         galx = None
 380         dsh = None
 381         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 382         if match:
 383           galx = match.group(1)
 384
 385         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 386         if match:
 387           dsh = match.group(1)
 388
 389         # Log in
 390         login_form_strs = {
 391                 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 392                 u'Email': username,
 393                 u'GALX': galx,
 394                 u'Passwd': password,
 395                 u'PersistentCookie': u'yes',
 396                 u'_utf8': u'霱',
 397                 u'bgresponse': u'js_disabled',
 398                 u'checkConnection': u'',
 399                 u'checkedDomains': u'youtube',
 400                 u'dnConn': u'',
 401                 u'dsh': dsh,
 402                 u'pstMsg': u'0',
 403                 u'rmShown': u'1',
 404                 u'secTok': u'',
 405                 u'signIn': u'Sign in',
 406                 u'timeStmp': u'',
 407                 u'service': u'youtube',
 408                 u'uilel': u'3',
 409                 u'hl': u'en_US',
 410         }
 411         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 412         # chokes on unicode
 413         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 414         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 415         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 416         try:
 417             self.report_login()
 418             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 419             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 420                 self._downloader.report_warning(u'unable to log in: bad username or password')
 421                 return
 422         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 423             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 424             return
 425
 426         # Confirm age
 427         age_form = {
 428                 'next_url':     '/',
 429                 'action_confirm':   'Confirm',
 430                 }
 431         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 432         try:
 433             self.report_age_confirmation()
 434             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 435         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 436             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 437             return
 438
 439     def _extract_id(self, url):
 440         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 441         if mobj is None:
 442             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 443             return
 444         video_id = mobj.group(2)
 445         return video_id
 446
 447     def _real_extract(self, url):
 448         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 449         mobj = re.search(self._NEXT_URL_RE, url)
 450         if mobj:
 451             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 452         video_id = self._extract_id(url)
 453
 454         # Get video webpage
 455         self.report_video_webpage_download(video_id)
 456         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 457         request = compat_urllib_request.Request(url)
 458         try:
 459             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 460         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 461             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
 462             return
 463
 464         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 465
 466         # Attempt to extract SWF player URL
 467         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 468         if mobj is not None:
 469             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 470         else:
 471             player_url = None
 472
 473         # Get video info
 474         self.report_video_info_webpage_download(video_id)
 475         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 476             video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 477                     % (video_id, el_type))
 478             request = compat_urllib_request.Request(video_info_url)
 479             try:
 480                 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
 481                 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
 482                 video_info = compat_parse_qs(video_info_webpage)
 483                 if 'token' in video_info:
 484                     break
 485             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 486                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
 487                 return
 488         if 'token' not in video_info:
 489             if 'reason' in video_info:
 490                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
 491             else:
 492                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 493             return
 494
 495         # Check for "rental" videos
 496         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 497             self._downloader.trouble(u'ERROR: "rental" videos not supported')
 498             return
 499
 500         # Start extracting information
 501         self.report_information_extraction(video_id)
 502
 503         # uploader
 504         if 'author' not in video_info:
 505             self._downloader.trouble(u'ERROR: unable to extract uploader name')
 506             return
 507         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 508
 509         # uploader_id
 510         video_uploader_id = None
 511         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 512         if mobj is not None:
 513             video_uploader_id = mobj.group(1)
 514         else:
 515             self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 516
 517         # title
 518         if 'title' not in video_info:
 519             self._downloader.trouble(u'ERROR: unable to extract video title')
 520             return
 521         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 522
 523         # thumbnail image
 524         if 'thumbnail_url' not in video_info:
 525             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 526             video_thumbnail = ''
 527         else:   # don't panic if we can't find it
 528             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 529
 530         # upload date
 531         upload_date = None
 532         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 533         if mobj is not None:
 534             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 535             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 536             for expression in format_expressions:
 537                 try:
 538                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 539                 except:
 540                     pass
 541
 542         # description
 543         video_description = get_element_by_id("eow-description", video_webpage)
 544         if video_description:
 545             video_description = clean_html(video_description)
 546         else:
 547             video_description = ''
 548
 549         # subtitles
 550         video_subtitles = None
 551
 552         if self._downloader.params.get('writesubtitles', False):
 553             video_subtitles = self._extract_subtitle(video_id)
 554             if video_subtitles:
 555                 (sub_error, sub_lang, sub) = video_subtitles[0]
 556                 if sub_error:
 557                     self._downloader.trouble(sub_error)
 558
 559         if self._downloader.params.get('allsubtitles', False):
 560             video_subtitles = self._extract_all_subtitles(video_id)
 561             for video_subtitle in video_subtitles:
 562                 (sub_error, sub_lang, sub) = video_subtitle
 563                 if sub_error:
 564                     self._downloader.trouble(sub_error)
 565
 566         if self._downloader.params.get('listsubtitles', False):
 567             sub_lang_list = self._list_available_subtitles(video_id)
 568             return
 569
 570         if 'length_seconds' not in video_info:
 571             self._downloader.trouble(u'WARNING: unable to extract video duration')
 572             video_duration = ''
 573         else:
 574             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 575
 576         # token
 577         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 578
 579         # Decide which formats to download
 580         req_format = self._downloader.params.get('format', None)
 581
 582         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 583             self.report_rtmp_download()
 584             video_url_list = [(None, video_info['conn'][0])]
 585         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 586             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 587             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 588             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 589             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 590
 591             format_limit = self._downloader.params.get('format_limit', None)
 592             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 593             if format_limit is not None and format_limit in available_formats:
 594                 format_list = available_formats[available_formats.index(format_limit):]
 595             else:
 596                 format_list = available_formats
 597             existing_formats = [x for x in format_list if x in url_map]
 598             if len(existing_formats) == 0:
 599                 self._downloader.trouble(u'ERROR: no known formats available for video')
 600                 return
 601             if self._downloader.params.get('listformats', None):
 602                 self._print_formats(existing_formats)
 603                 return
 604             if req_format is None or req_format == 'best':
 605                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 606             elif req_format == 'worst':
 607                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 608             elif req_format in ('-1', 'all'):
 609                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 610             else:
 611                 # Specific formats. We pick the first in a slash-delimeted sequence.
 612                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 613                 req_formats = req_format.split('/')
 614                 video_url_list = None
 615                 for rf in req_formats:
 616                     if rf in url_map:
 617                         video_url_list = [(rf, url_map[rf])]
 618                         break
 619                 if video_url_list is None:
 620                     self._downloader.trouble(u'ERROR: requested format not available')
 621                     return
 622         else:
 623             self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 624             return
 625
 626         results = []
 627         for format_param, video_real_url in video_url_list:
 628             # Extension
 629             video_extension = self._video_extensions.get(format_param, 'flv')
 630
 631             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 632                                               self._video_dimensions.get(format_param, '???'))
 633
 634             results.append({
 635                 'id':       video_id,
 636                 'url':      video_real_url,
 637                 'uploader': video_uploader,
 638                 'uploader_id': video_uploader_id,
 639                 'upload_date':  upload_date,
 640                 'title':    video_title,
 641                 'ext':      video_extension,
 642                 'format':   video_format,
 643                 'thumbnail':    video_thumbnail,
 644                 'description':  video_description,
 645                 'player_url':   player_url,
 646                 'subtitles':    video_subtitles,
 647                 'duration':     video_duration
 648             })
 649         return results
 650
 651
 652 class MetacafeIE(InfoExtractor):
 653     """Information Extractor for metacafe.com."""
 654
 655     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 656     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 657     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 658     IE_NAME = u'metacafe'
 659
 660     def __init__(self, downloader=None):
 661         InfoExtractor.__init__(self, downloader)
 662
 663     def report_disclaimer(self):
 664         """Report disclaimer retrieval."""
 665         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 666
 667     def report_age_confirmation(self):
 668         """Report attempt to confirm age."""
 669         self._downloader.to_screen(u'[metacafe] Confirming age')
 670
 671     def report_download_webpage(self, video_id):
 672         """Report webpage download."""
 673         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 674
 675     def report_extraction(self, video_id):
 676         """Report information extraction."""
 677         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 678
 679     def _real_initialize(self):
 680         # Retrieve disclaimer
 681         request = compat_urllib_request.Request(self._DISCLAIMER)
 682         try:
 683             self.report_disclaimer()
 684             disclaimer = compat_urllib_request.urlopen(request).read()
 685         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 686             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
 687             return
 688
 689         # Confirm age
 690         disclaimer_form = {
 691             'filters': '0',
 692             'submit': "Continue - I'm over 18",
 693             }
 694         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 695         try:
 696             self.report_age_confirmation()
 697             disclaimer = compat_urllib_request.urlopen(request).read()
 698         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 699             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 700             return
 701
 702     def _real_extract(self, url):
 703         # Extract id and simplified title from URL
 704         mobj = re.match(self._VALID_URL, url)
 705         if mobj is None:
 706             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 707             return
 708
 709         video_id = mobj.group(1)
 710
 711         # Check if video comes from YouTube
 712         mobj2 = re.match(r'^yt-(.*)$', video_id)
 713         if mobj2 is not None:
 714             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 715             return
 716
 717         # Retrieve video webpage to extract further information
 718         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 719         try:
 720             self.report_download_webpage(video_id)
 721             webpage = compat_urllib_request.urlopen(request).read()
 722         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 723             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 724             return
 725
 726         # Extract URL, uploader and title from webpage
 727         self.report_extraction(video_id)
 728         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 729         if mobj is not None:
 730             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 731             video_extension = mediaURL[-3:]
 732
 733             # Extract gdaKey if available
 734             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 735             if mobj is None:
 736                 video_url = mediaURL
 737             else:
 738                 gdaKey = mobj.group(1)
 739                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 740         else:
 741             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 742             if mobj is None:
 743                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 744                 return
 745             vardict = compat_parse_qs(mobj.group(1))
 746             if 'mediaData' not in vardict:
 747                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 748                 return
 749             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 750             if mobj is None:
 751                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 752                 return
 753             mediaURL = mobj.group(1).replace('\\/', '/')
 754             video_extension = mediaURL[-3:]
 755             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 756
 757         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 758         if mobj is None:
 759             self._downloader.trouble(u'ERROR: unable to extract title')
 760             return
 761         video_title = mobj.group(1).decode('utf-8')
 762
 763         mobj = re.search(r'submitter=(.*?);', webpage)
 764         if mobj is None:
 765             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 766             return
 767         video_uploader = mobj.group(1)
 768
 769         return [{
 770             'id':       video_id.decode('utf-8'),
 771             'url':      video_url.decode('utf-8'),
 772             'uploader': video_uploader.decode('utf-8'),
 773             'upload_date':  None,
 774             'title':    video_title,
 775             'ext':      video_extension.decode('utf-8'),
 776         }]
 777
 778
 779 class DailymotionIE(InfoExtractor):
 780     """Information Extractor for Dailymotion"""
 781
 782     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 783     IE_NAME = u'dailymotion'
 784     _WORKING = False
 785
 786     def __init__(self, downloader=None):
 787         InfoExtractor.__init__(self, downloader)
 788
 789     def report_extraction(self, video_id):
 790         """Report information extraction."""
 791         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 792
 793     def _real_extract(self, url):
 794         # Extract id and simplified title from URL
 795         mobj = re.match(self._VALID_URL, url)
 796         if mobj is None:
 797             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 798             return
 799
 800         video_id = mobj.group(1).split('_')[0].split('?')[0]
 801
 802         video_extension = 'mp4'
 803
 804         # Retrieve video webpage to extract further information
 805         request = compat_urllib_request.Request(url)
 806         request.add_header('Cookie', 'family_filter=off')
 807         webpage = self._download_webpage(request, video_id)
 808
 809         # Extract URL, uploader and title from webpage
 810         self.report_extraction(video_id)
 811         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 812         if mobj is None:
 813             self._downloader.trouble(u'ERROR: unable to extract media URL')
 814             return
 815         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 816
 817         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 818             if key in flashvars:
 819                 max_quality = key
 820                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 821                 break
 822         else:
 823             self._downloader.trouble(u'ERROR: unable to extract video URL')
 824             return
 825
 826         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 827         if mobj is None:
 828             self._downloader.trouble(u'ERROR: unable to extract video URL')
 829             return
 830
 831         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 832
 833         # TODO: support choosing qualities
 834
 835         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 836         if mobj is None:
 837             self._downloader.trouble(u'ERROR: unable to extract title')
 838             return
 839         video_title = unescapeHTML(mobj.group('title'))
 840
 841         video_uploader = None
 842         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 843         if mobj is None:
 844             # lookin for official user
 845             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 846             if mobj_official is None:
 847                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 848             else:
 849                 video_uploader = mobj_official.group(1)
 850         else:
 851             video_uploader = mobj.group(1)
 852
 853         video_upload_date = None
 854         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 855         if mobj is not None:
 856             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 857
 858         return [{
 859             'id':       video_id,
 860             'url':      video_url,
 861             'uploader': video_uploader,
 862             'upload_date':  video_upload_date,
 863             'title':    video_title,
 864             'ext':      video_extension,
 865         }]
 866
 867
 868 class PhotobucketIE(InfoExtractor):
 869     """Information extractor for photobucket.com."""
 870
 871     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 872     IE_NAME = u'photobucket'
 873
 874     def __init__(self, downloader=None):
 875         InfoExtractor.__init__(self, downloader)
 876
 877     def report_download_webpage(self, video_id):
 878         """Report webpage download."""
 879         self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 880
 881     def report_extraction(self, video_id):
 882         """Report information extraction."""
 883         self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 884
 885     def _real_extract(self, url):
 886         # Extract id from URL
 887         mobj = re.match(self._VALID_URL, url)
 888         if mobj is None:
 889             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 890             return
 891
 892         video_id = mobj.group(1)
 893
 894         video_extension = 'flv'
 895
 896         # Retrieve video webpage to extract further information
 897         request = compat_urllib_request.Request(url)
 898         try:
 899             self.report_download_webpage(video_id)
 900             webpage = compat_urllib_request.urlopen(request).read()
 901         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 902             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 903             return
 904
 905         # Extract URL, uploader, and title from webpage
 906         self.report_extraction(video_id)
 907         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 908         if mobj is None:
 909             self._downloader.trouble(u'ERROR: unable to extract media URL')
 910             return
 911         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 912
 913         video_url = mediaURL
 914
 915         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 916         if mobj is None:
 917             self._downloader.trouble(u'ERROR: unable to extract title')
 918             return
 919         video_title = mobj.group(1).decode('utf-8')
 920
 921         video_uploader = mobj.group(2).decode('utf-8')
 922
 923         return [{
 924             'id':       video_id.decode('utf-8'),
 925             'url':      video_url.decode('utf-8'),
 926             'uploader': video_uploader,
 927             'upload_date':  None,
 928             'title':    video_title,
 929             'ext':      video_extension.decode('utf-8'),
 930         }]
 931
 932
 933 class YahooIE(InfoExtractor):
 934     """Information extractor for video.yahoo.com."""
 935
 936     _WORKING = False
 937     # _VALID_URL matches all Yahoo! Video URLs
 938     # _VPAGE_URL matches only the extractable '/watch/' URLs
 939     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 940     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 941     IE_NAME = u'video.yahoo'
 942
 943     def __init__(self, downloader=None):
 944         InfoExtractor.__init__(self, downloader)
 945
 946     def report_download_webpage(self, video_id):
 947         """Report webpage download."""
 948         self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 949
 950     def report_extraction(self, video_id):
 951         """Report information extraction."""
 952         self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 953
 954     def _real_extract(self, url, new_video=True):
 955         # Extract ID from URL
 956         mobj = re.match(self._VALID_URL, url)
 957         if mobj is None:
 958             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 959             return
 960
 961         video_id = mobj.group(2)
 962         video_extension = 'flv'
 963
 964         # Rewrite valid but non-extractable URLs as
 965         # extractable English language /watch/ URLs
 966         if re.match(self._VPAGE_URL, url) is None:
 967             request = compat_urllib_request.Request(url)
 968             try:
 969                 webpage = compat_urllib_request.urlopen(request).read()
 970             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 971                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 972                 return
 973
 974             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 975             if mobj is None:
 976                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 977                 return
 978             yahoo_id = mobj.group(1)
 979
 980             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 981             if mobj is None:
 982                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 983                 return
 984             yahoo_vid = mobj.group(1)
 985
 986             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 987             return self._real_extract(url, new_video=False)
 988
 989         # Retrieve video webpage to extract further information
 990         request = compat_urllib_request.Request(url)
 991         try:
 992             self.report_download_webpage(video_id)
 993             webpage = compat_urllib_request.urlopen(request).read()
 994         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 995             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 996             return
 997
 998         # Extract uploader and title from webpage
 999         self.report_extraction(video_id)
1000         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1001         if mobj is None:
1002             self._downloader.trouble(u'ERROR: unable to extract video title')
1003             return
1004         video_title = mobj.group(1).decode('utf-8')
1005
1006         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1007         if mobj is None:
1008             self._downloader.trouble(u'ERROR: unable to extract video uploader')
1009             return
1010         video_uploader = mobj.group(1).decode('utf-8')
1011
1012         # Extract video thumbnail
1013         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1014         if mobj is None:
1015             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1016             return
1017         video_thumbnail = mobj.group(1).decode('utf-8')
1018
1019         # Extract video description
1020         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1021         if mobj is None:
1022             self._downloader.trouble(u'ERROR: unable to extract video description')
1023             return
1024         video_description = mobj.group(1).decode('utf-8')
1025         if not video_description:
1026             video_description = 'No description available.'
1027
1028         # Extract video height and width
1029         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1030         if mobj is None:
1031             self._downloader.trouble(u'ERROR: unable to extract video height')
1032             return
1033         yv_video_height = mobj.group(1)
1034
1035         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1036         if mobj is None:
1037             self._downloader.trouble(u'ERROR: unable to extract video width')
1038             return
1039         yv_video_width = mobj.group(1)
1040
1041         # Retrieve video playlist to extract media URL
1042         # I'm not completely sure what all these options are, but we
1043         # seem to need most of them, otherwise the server sends a 401.
1044         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1045         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1046         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1047                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1048                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1049         try:
1050             self.report_download_webpage(video_id)
1051             webpage = compat_urllib_request.urlopen(request).read()
1052         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1053             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1054             return
1055
1056         # Extract media URL from playlist XML
1057         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1058         if mobj is None:
1059             self._downloader.trouble(u'ERROR: Unable to extract media URL')
1060             return
1061         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1062         video_url = unescapeHTML(video_url)
1063
1064         return [{
1065             'id':       video_id.decode('utf-8'),
1066             'url':      video_url,
1067             'uploader': video_uploader,
1068             'upload_date':  None,
1069             'title':    video_title,
1070             'ext':      video_extension.decode('utf-8'),
1071             'thumbnail':    video_thumbnail.decode('utf-8'),
1072             'description':  video_description,
1073         }]
1074
1075
1076 class VimeoIE(InfoExtractor):
1077     """Information extractor for vimeo.com."""
1078
1079     # _VALID_URL matches Vimeo URLs
1080     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1081     IE_NAME = u'vimeo'
1082
1083     def __init__(self, downloader=None):
1084         InfoExtractor.__init__(self, downloader)
1085
1086     def report_download_webpage(self, video_id):
1087         """Report webpage download."""
1088         self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1089
1090     def report_extraction(self, video_id):
1091         """Report information extraction."""
1092         self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1093
1094     def _real_extract(self, url, new_video=True):
1095         # Extract ID from URL
1096         mobj = re.match(self._VALID_URL, url)
1097         if mobj is None:
1098             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1099             return
1100
1101         video_id = mobj.group('id')
1102         if not mobj.group('proto'):
1103             url = 'https://' + url
1104         if mobj.group('direct_link'):
1105             url = 'https://vimeo.com/' + video_id
1106
1107         # Retrieve video webpage to extract further information
1108         request = compat_urllib_request.Request(url, None, std_headers)
1109         try:
1110             self.report_download_webpage(video_id)
1111             webpage_bytes = compat_urllib_request.urlopen(request).read()
1112             webpage = webpage_bytes.decode('utf-8')
1113         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1114             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1115             return
1116
1117         # Now we begin extracting as much information as we can from what we
1118         # retrieved. First we extract the information common to all extractors,
1119         # and latter we extract those that are Vimeo specific.
1120         self.report_extraction(video_id)
1121
1122         # Extract the config JSON
1123         try:
1124             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1125             config = json.loads(config)
1126         except:
1127             self._downloader.trouble(u'ERROR: unable to extract info section')
1128             return
1129
1130         # Extract title
1131         video_title = config["video"]["title"]
1132
1133         # Extract uploader and uploader_id
1134         video_uploader = config["video"]["owner"]["name"]
1135         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1136
1137         # Extract video thumbnail
1138         video_thumbnail = config["video"]["thumbnail"]
1139
1140         # Extract video description
1141         video_description = get_element_by_attribute("itemprop", "description", webpage)
1142         if video_description: video_description = clean_html(video_description)
1143         else: video_description = ''
1144
1145         # Extract upload date
1146         video_upload_date = None
1147         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1148         if mobj is not None:
1149             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1150
1151         # Vimeo specific: extract request signature and timestamp
1152         sig = config['request']['signature']
1153         timestamp = config['request']['timestamp']
1154
1155         # Vimeo specific: extract video codec and quality information
1156         # First consider quality, then codecs, then take everything
1157         # TODO bind to format param
1158         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1159         files = { 'hd': [], 'sd': [], 'other': []}
1160         for codec_name, codec_extension in codecs:
1161             if codec_name in config["video"]["files"]:
1162                 if 'hd' in config["video"]["files"][codec_name]:
1163                     files['hd'].append((codec_name, codec_extension, 'hd'))
1164                 elif 'sd' in config["video"]["files"][codec_name]:
1165                     files['sd'].append((codec_name, codec_extension, 'sd'))
1166                 else:
1167                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1168
1169         for quality in ('hd', 'sd', 'other'):
1170             if len(files[quality]) > 0:
1171                 video_quality = files[quality][0][2]
1172                 video_codec = files[quality][0][0]
1173                 video_extension = files[quality][0][1]
1174                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1175                 break
1176         else:
1177             self._downloader.trouble(u'ERROR: no known codec found')
1178             return
1179
1180         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1181                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1182
1183         return [{
1184             'id':       video_id,
1185             'url':      video_url,
1186             'uploader': video_uploader,
1187             'uploader_id': video_uploader_id,
1188             'upload_date':  video_upload_date,
1189             'title':    video_title,
1190             'ext':      video_extension,
1191             'thumbnail':    video_thumbnail,
1192             'description':  video_description,
1193         }]
1194
1195
1196 class ArteTvIE(InfoExtractor):
1197     """arte.tv information extractor."""
1198
1199     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1200     _LIVE_URL = r'index-[0-9]+\.html$'
1201
1202     IE_NAME = u'arte.tv'
1203
1204     def __init__(self, downloader=None):
1205         InfoExtractor.__init__(self, downloader)
1206
1207     def report_download_webpage(self, video_id):
1208         """Report webpage download."""
1209         self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1210
1211     def report_extraction(self, video_id):
1212         """Report information extraction."""
1213         self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1214
1215     def fetch_webpage(self, url):
1216         request = compat_urllib_request.Request(url)
1217         try:
1218             self.report_download_webpage(url)
1219             webpage = compat_urllib_request.urlopen(request).read()
1220         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1221             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1222             return
1223         except ValueError as err:
1224             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1225             return
1226         return webpage
1227
1228     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1229         page = self.fetch_webpage(url)
1230         mobj = re.search(regex, page, regexFlags)
1231         info = {}
1232
1233         if mobj is None:
1234             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1235             return
1236
1237         for (i, key, err) in matchTuples:
1238             if mobj.group(i) is None:
1239                 self._downloader.trouble(err)
1240                 return
1241             else:
1242                 info[key] = mobj.group(i)
1243
1244         return info
1245
1246     def extractLiveStream(self, url):
1247         video_lang = url.split('/')[-4]
1248         info = self.grep_webpage(
1249             url,
1250             r'src="(.*?/videothek_js.*?\.js)',
1251             0,
1252             [
1253                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1254             ]
1255         )
1256         http_host = url.split('/')[2]
1257         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1258         info = self.grep_webpage(
1259             next_url,
1260             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1261                 '(http://.*?\.swf).*?' +
1262                 '(rtmp://.*?)\'',
1263             re.DOTALL,
1264             [
1265                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1266                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1267                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1268             ]
1269         )
1270         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1271
1272     def extractPlus7Stream(self, url):
1273         video_lang = url.split('/')[-3]
1274         info = self.grep_webpage(
1275             url,
1276             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1277             0,
1278             [
1279                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1280             ]
1281         )
1282         next_url = compat_urllib_parse.unquote(info.get('url'))
1283         info = self.grep_webpage(
1284             next_url,
1285             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1286             0,
1287             [
1288                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1289             ]
1290         )
1291         next_url = compat_urllib_parse.unquote(info.get('url'))
1292
1293         info = self.grep_webpage(
1294             next_url,
1295             r'<video id="(.*?)".*?>.*?' +
1296                 '<name>(.*?)</name>.*?' +
1297                 '<dateVideo>(.*?)</dateVideo>.*?' +
1298                 '<url quality="hd">(.*?)</url>',
1299             re.DOTALL,
1300             [
1301                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1302                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1303                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1304                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1305             ]
1306         )
1307
1308         return {
1309             'id':           info.get('id'),
1310             'url':          compat_urllib_parse.unquote(info.get('url')),
1311             'uploader':     u'arte.tv',
1312             'upload_date':  info.get('date'),
1313             'title':        info.get('title').decode('utf-8'),
1314             'ext':          u'mp4',
1315             'format':       u'NA',
1316             'player_url':   None,
1317         }
1318
1319     def _real_extract(self, url):
1320         video_id = url.split('/')[-1]
1321         self.report_extraction(video_id)
1322
1323         if re.search(self._LIVE_URL, video_id) is not None:
1324             self.extractLiveStream(url)
1325             return
1326         else:
1327             info = self.extractPlus7Stream(url)
1328
1329         return [info]
1330
1331
1332 class GenericIE(InfoExtractor):
1333     """Generic last-resort information extractor."""
1334
1335     _VALID_URL = r'.*'
1336     IE_NAME = u'generic'
1337
1338     def __init__(self, downloader=None):
1339         InfoExtractor.__init__(self, downloader)
1340
1341     def report_download_webpage(self, video_id):
1342         """Report webpage download."""
1343         if not self._downloader.params.get('test', False):
1344             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1345         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1346
1347     def report_extraction(self, video_id):
1348         """Report information extraction."""
1349         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1350
1351     def report_following_redirect(self, new_url):
1352         """Report information extraction."""
1353         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1354
1355     def _test_redirect(self, url):
1356         """Check if it is a redirect, like url shorteners, in case return the new url."""
1357         class HeadRequest(compat_urllib_request.Request):
1358             def get_method(self):
1359                 return "HEAD"
1360
1361         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1362             """
1363             Subclass the HTTPRedirectHandler to make it use our
1364             HeadRequest also on the redirected URL
1365             """
1366             def redirect_request(self, req, fp, code, msg, headers, newurl):
1367                 if code in (301, 302, 303, 307):
1368                     newurl = newurl.replace(' ', '%20')
1369                     newheaders = dict((k,v) for k,v in req.headers.items()
1370                                       if k.lower() not in ("content-length", "content-type"))
1371                     return HeadRequest(newurl,
1372                                        headers=newheaders,
1373                                        origin_req_host=req.get_origin_req_host(),
1374                                        unverifiable=True)
1375                 else:
1376                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1377
1378         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1379             """
1380             Fallback to GET if HEAD is not allowed (405 HTTP error)
1381             """
1382             def http_error_405(self, req, fp, code, msg, headers):
1383                 fp.read()
1384                 fp.close()
1385
1386                 newheaders = dict((k,v) for k,v in req.headers.items()
1387                                   if k.lower() not in ("content-length", "content-type"))
1388                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1389                                                  headers=newheaders,
1390                                                  origin_req_host=req.get_origin_req_host(),
1391                                                  unverifiable=True))
1392
1393         # Build our opener
1394         opener = compat_urllib_request.OpenerDirector()
1395         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1396                         HTTPMethodFallback, HEADRedirectHandler,
1397                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1398             opener.add_handler(handler())
1399
1400         response = opener.open(HeadRequest(url))
1401         new_url = response.geturl()
1402
1403         if url == new_url:
1404             return False
1405
1406         self.report_following_redirect(new_url)
1407         return new_url
1408
1409     def _real_extract(self, url):
1410         new_url = self._test_redirect(url)
1411         if new_url: return [self.url_result(new_url)]
1412
1413         video_id = url.split('/')[-1]
1414         try:
1415             webpage = self._download_webpage(url, video_id)
1416         except ValueError as err:
1417             # since this is the last-resort InfoExtractor, if
1418             # this error is thrown, it'll be thrown here
1419             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1420             return
1421
1422         self.report_extraction(video_id)
1423         # Start with something easy: JW Player in SWFObject
1424         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1425         if mobj is None:
1426             # Broaden the search a little bit
1427             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1428         if mobj is None:
1429             # Broaden the search a little bit: JWPlayer JS loader
1430             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1431         if mobj is None:
1432             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1433             return
1434
1435         # It's possible that one of the regexes
1436         # matched, but returned an empty group:
1437         if mobj.group(1) is None:
1438             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1439             return
1440
1441         video_url = compat_urllib_parse.unquote(mobj.group(1))
1442         video_id = os.path.basename(video_url)
1443
1444         # here's a fun little line of code for you:
1445         video_extension = os.path.splitext(video_id)[1][1:]
1446         video_id = os.path.splitext(video_id)[0]
1447
1448         # it's tempting to parse this further, but you would
1449         # have to take into account all the variations like
1450         #   Video Title - Site Name
1451         #   Site Name | Video Title
1452         #   Video Title - Tagline | Site Name
1453         # and so on and so forth; it's just not practical
1454         mobj = re.search(r'<title>(.*)</title>', webpage)
1455         if mobj is None:
1456             self._downloader.trouble(u'ERROR: unable to extract title')
1457             return
1458         video_title = mobj.group(1)
1459
1460         # video uploader is domain name
1461         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1462         if mobj is None:
1463             self._downloader.trouble(u'ERROR: unable to extract title')
1464             return
1465         video_uploader = mobj.group(1)
1466
1467         return [{
1468             'id':       video_id,
1469             'url':      video_url,
1470             'uploader': video_uploader,
1471             'upload_date':  None,
1472             'title':    video_title,
1473             'ext':      video_extension,
1474         }]
1475
1476
1477 class YoutubeSearchIE(InfoExtractor):
1478     """Information Extractor for YouTube search queries."""
1479     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1480     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1481     _max_youtube_results = 1000
1482     IE_NAME = u'youtube:search'
1483
1484     def __init__(self, downloader=None):
1485         InfoExtractor.__init__(self, downloader)
1486
1487     def report_download_page(self, query, pagenum):
1488         """Report attempt to download search page with given number."""
1489         query = query.decode(preferredencoding())
1490         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1491
1492     def _real_extract(self, query):
1493         mobj = re.match(self._VALID_URL, query)
1494         if mobj is None:
1495             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1496             return
1497
1498         prefix, query = query.split(':')
1499         prefix = prefix[8:]
1500         query = query.encode('utf-8')
1501         if prefix == '':
1502             self._download_n_results(query, 1)
1503             return
1504         elif prefix == 'all':
1505             self._download_n_results(query, self._max_youtube_results)
1506             return
1507         else:
1508             try:
1509                 n = int(prefix)
1510                 if n <= 0:
1511                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1512                     return
1513                 elif n > self._max_youtube_results:
1514                     self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1515                     n = self._max_youtube_results
1516                 self._download_n_results(query, n)
1517                 return
1518             except ValueError: # parsing prefix as integer fails
1519                 self._download_n_results(query, 1)
1520                 return
1521
1522     def _download_n_results(self, query, n):
1523         """Downloads a specified number of results for a query"""
1524
1525         video_ids = []
1526         pagenum = 0
1527         limit = n
1528
1529         while (50 * pagenum) < limit:
1530             self.report_download_page(query, pagenum+1)
1531             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1532             request = compat_urllib_request.Request(result_url)
1533             try:
1534                 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1535             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1536                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1537                 return
1538             api_response = json.loads(data)['data']
1539
1540             if not 'items' in api_response:
1541                 self._downloader.trouble(u'[youtube] No video results')
1542                 return
1543
1544             new_ids = list(video['id'] for video in api_response['items'])
1545             video_ids += new_ids
1546
1547             limit = min(n, api_response['totalItems'])
1548             pagenum += 1
1549
1550         if len(video_ids) > n:
1551             video_ids = video_ids[:n]
1552         for id in video_ids:
1553             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1554         return
1555
1556
1557 class GoogleSearchIE(InfoExtractor):
1558     """Information Extractor for Google Video search queries."""
1559     _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1560     _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1561     _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1562     _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1563     _max_google_results = 1000
1564     IE_NAME = u'video.google:search'
1565
1566     def __init__(self, downloader=None):
1567         InfoExtractor.__init__(self, downloader)
1568
1569     def report_download_page(self, query, pagenum):
1570         """Report attempt to download playlist page with given number."""
1571         query = query.decode(preferredencoding())
1572         self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1573
1574     def _real_extract(self, query):
1575         mobj = re.match(self._VALID_URL, query)
1576         if mobj is None:
1577             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1578             return
1579
1580         prefix, query = query.split(':')
1581         prefix = prefix[8:]
1582         query = query.encode('utf-8')
1583         if prefix == '':
1584             self._download_n_results(query, 1)
1585             return
1586         elif prefix == 'all':
1587             self._download_n_results(query, self._max_google_results)
1588             return
1589         else:
1590             try:
1591                 n = int(prefix)
1592                 if n <= 0:
1593                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1594                     return
1595                 elif n > self._max_google_results:
1596                     self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1597                     n = self._max_google_results
1598                 self._download_n_results(query, n)
1599                 return
1600             except ValueError: # parsing prefix as integer fails
1601                 self._download_n_results(query, 1)
1602                 return
1603
1604     def _download_n_results(self, query, n):
1605         """Downloads a specified number of results for a query"""
1606
1607         video_ids = []
1608         pagenum = 0
1609
1610         while True:
1611             self.report_download_page(query, pagenum)
1612             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1613             request = compat_urllib_request.Request(result_url)
1614             try:
1615                 page = compat_urllib_request.urlopen(request).read()
1616             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1617                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1618                 return
1619
1620             # Extract video identifiers
1621             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1622                 video_id = mobj.group(1)
1623                 if video_id not in video_ids:
1624                     video_ids.append(video_id)
1625                     if len(video_ids) == n:
1626                         # Specified n videos reached
1627                         for id in video_ids:
1628                             self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1629                         return
1630
1631             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1632                 for id in video_ids:
1633                     self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1634                 return
1635
1636             pagenum = pagenum + 1
1637
1638
1639 class YahooSearchIE(InfoExtractor):
1640     """Information Extractor for Yahoo! Video search queries."""
1641
1642     _WORKING = False
1643     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1644     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1645     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1646     _MORE_PAGES_INDICATOR = r'\s*Next'
1647     _max_yahoo_results = 1000
1648     IE_NAME = u'video.yahoo:search'
1649
1650     def __init__(self, downloader=None):
1651         InfoExtractor.__init__(self, downloader)
1652
1653     def report_download_page(self, query, pagenum):
1654         """Report attempt to download playlist page with given number."""
1655         query = query.decode(preferredencoding())
1656         self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1657
1658     def _real_extract(self, query):
1659         mobj = re.match(self._VALID_URL, query)
1660         if mobj is None:
1661             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1662             return
1663
1664         prefix, query = query.split(':')
1665         prefix = prefix[8:]
1666         query = query.encode('utf-8')
1667         if prefix == '':
1668             self._download_n_results(query, 1)
1669             return
1670         elif prefix == 'all':
1671             self._download_n_results(query, self._max_yahoo_results)
1672             return
1673         else:
1674             try:
1675                 n = int(prefix)
1676                 if n <= 0:
1677                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1678                     return
1679                 elif n > self._max_yahoo_results:
1680                     self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1681                     n = self._max_yahoo_results
1682                 self._download_n_results(query, n)
1683                 return
1684             except ValueError: # parsing prefix as integer fails
1685                 self._download_n_results(query, 1)
1686                 return
1687
1688     def _download_n_results(self, query, n):
1689         """Downloads a specified number of results for a query"""
1690
1691         video_ids = []
1692         already_seen = set()
1693         pagenum = 1
1694
1695         while True:
1696             self.report_download_page(query, pagenum)
1697             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1698             request = compat_urllib_request.Request(result_url)
1699             try:
1700                 page = compat_urllib_request.urlopen(request).read()
1701             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1702                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1703                 return
1704
1705             # Extract video identifiers
1706             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1707                 video_id = mobj.group(1)
1708                 if video_id not in already_seen:
1709                     video_ids.append(video_id)
1710                     already_seen.add(video_id)
1711                     if len(video_ids) == n:
1712                         # Specified n videos reached
1713                         for id in video_ids:
1714                             self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1715                         return
1716
1717             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1718                 for id in video_ids:
1719                     self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1720                 return
1721
1722             pagenum = pagenum + 1
1723
1724
1725 class YoutubePlaylistIE(InfoExtractor):
1726     """Information Extractor for YouTube playlists."""
1727
1728     _VALID_URL = r"""(?:
1729                         (?:https?://)?
1730                         (?:\w+\.)?
1731                         youtube\.com/
1732                         (?:
1733                            (?:course|view_play_list|my_playlists|artist|playlist|watch)
1734                            \? (?:.*?&)*? (?:p|a|list)=
1735                         |  user/.*?/user/
1736                         |  p/
1737                         |  user/.*?#[pg]/c/
1738                         )
1739                         ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1740                         .*
1741                      |
1742                         ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1743                      )"""
1744     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1745     _MAX_RESULTS = 50
1746     IE_NAME = u'youtube:playlist'
1747
1748     def __init__(self, downloader=None):
1749         InfoExtractor.__init__(self, downloader)
1750
1751     @classmethod
1752     def suitable(cls, url):
1753         """Receives a URL and returns True if suitable for this IE."""
1754         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1755
1756     def report_download_page(self, playlist_id, pagenum):
1757         """Report attempt to download playlist page with given number."""
1758         self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1759
1760     def _real_extract(self, url):
1761         # Extract playlist id
1762         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1763         if mobj is None:
1764             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1765             return
1766
1767         # Download playlist videos from API
1768         playlist_id = mobj.group(1) or mobj.group(2)
1769         page_num = 1
1770         videos = []
1771
1772         while True:
1773             self.report_download_page(playlist_id, page_num)
1774
1775             url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1776             try:
1777                 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1778             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1779                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1780                 return
1781
1782             try:
1783                 response = json.loads(page)
1784             except ValueError as err:
1785                 self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
1786                 return
1787
1788             if not 'feed' in response or not 'entry' in response['feed']:
1789                 self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
1790                 return
1791             videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1792                         for entry in response['feed']['entry']
1793                         if 'content' in entry ]
1794
1795             if len(response['feed']['entry']) < self._MAX_RESULTS:
1796                 break
1797             page_num += 1
1798
1799         videos = [v[1] for v in sorted(videos)]
1800         total = len(videos)
1801
1802         playliststart = self._downloader.params.get('playliststart', 1) - 1
1803         playlistend = self._downloader.params.get('playlistend', -1)
1804         if playlistend == -1:
1805             videos = videos[playliststart:]
1806         else:
1807             videos = videos[playliststart:playlistend]
1808
1809         if len(videos) == total:
1810             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1811         else:
1812             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
1813
1814         url_results = [self.url_result(url) for url in videos]
1815         return [self.playlist_result(url_results, playlist_id)]
1816
1817
1818 class YoutubeChannelIE(InfoExtractor):
1819     """Information Extractor for YouTube channels."""
1820
1821     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1822     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1823     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1824     IE_NAME = u'youtube:channel'
1825
1826     def report_download_page(self, channel_id, pagenum):
1827         """Report attempt to download channel page with given number."""
1828         self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1829
1830     def _real_extract(self, url):
1831         # Extract channel id
1832         mobj = re.match(self._VALID_URL, url)
1833         if mobj is None:
1834             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1835             return
1836
1837         # Download channel pages
1838         channel_id = mobj.group(1)
1839         video_ids = []
1840         pagenum = 1
1841
1842         while True:
1843             self.report_download_page(channel_id, pagenum)
1844             url = self._TEMPLATE_URL % (channel_id, pagenum)
1845             request = compat_urllib_request.Request(url)
1846             try:
1847                 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1848             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1849                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1850                 return
1851
1852             # Extract video identifiers
1853             ids_in_page = []
1854             for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1855                 if mobj.group(1) not in ids_in_page:
1856                     ids_in_page.append(mobj.group(1))
1857             video_ids.extend(ids_in_page)
1858
1859             if self._MORE_PAGES_INDICATOR not in page:
1860                 break
1861             pagenum = pagenum + 1
1862
1863         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1864
1865         urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1866         url_entries = [self.url_result(url) for url in urls]
1867         return [self.playlist_result(url_entries, channel_id)]
1868
1869
1870 class YoutubeUserIE(InfoExtractor):
1871     """Information Extractor for YouTube users."""
1872
1873     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1874     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1875     _GDATA_PAGE_SIZE = 50
1876     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1877     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1878     IE_NAME = u'youtube:user'
1879
1880     def __init__(self, downloader=None):
1881         InfoExtractor.__init__(self, downloader)
1882
1883     def report_download_page(self, username, start_index):
1884         """Report attempt to download user page."""
1885         self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1886                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1887
1888     def _real_extract(self, url):
1889         # Extract username
1890         mobj = re.match(self._VALID_URL, url)
1891         if mobj is None:
1892             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1893             return
1894
1895         username = mobj.group(1)
1896
1897         # Download video ids using YouTube Data API. Result size per
1898         # query is limited (currently to 50 videos) so we need to query
1899         # page by page until there are no video ids - it means we got
1900         # all of them.
1901
1902         video_ids = []
1903         pagenum = 0
1904
1905         while True:
1906             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1907             self.report_download_page(username, start_index)
1908
1909             request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1910
1911             try:
1912                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1913             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1914                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1915                 return
1916
1917             # Extract video identifiers
1918             ids_in_page = []
1919
1920             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1921                 if mobj.group(1) not in ids_in_page:
1922                     ids_in_page.append(mobj.group(1))
1923
1924             video_ids.extend(ids_in_page)
1925
1926             # A little optimization - if current page is not
1927             # "full", ie. does not contain PAGE_SIZE video ids then
1928             # we can assume that this page is the last one - there
1929             # are no more ids on further pages - no need to query
1930             # again.
1931
1932             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1933                 break
1934
1935             pagenum += 1
1936
1937         all_ids_count = len(video_ids)
1938         playliststart = self._downloader.params.get('playliststart', 1) - 1
1939         playlistend = self._downloader.params.get('playlistend', -1)
1940
1941         if playlistend == -1:
1942             video_ids = video_ids[playliststart:]
1943         else:
1944             video_ids = video_ids[playliststart:playlistend]
1945
1946         self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1947                 (username, all_ids_count, len(video_ids)))
1948
1949         urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1950         url_results = [self.url_result(url) for url in urls]
1951         return [self.playlist_result(url_results, playlist_title = username)]
1952
1953
1954 class BlipTVUserIE(InfoExtractor):
1955     """Information Extractor for blip.tv users."""
1956
1957     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1958     _PAGE_SIZE = 12
1959     IE_NAME = u'blip.tv:user'
1960
1961     def __init__(self, downloader=None):
1962         InfoExtractor.__init__(self, downloader)
1963
1964     def report_download_page(self, username, pagenum):
1965         """Report attempt to download user page."""
1966         self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1967                 (self.IE_NAME, username, pagenum))
1968
1969     def _real_extract(self, url):
1970         # Extract username
1971         mobj = re.match(self._VALID_URL, url)
1972         if mobj is None:
1973             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1974             return
1975
1976         username = mobj.group(1)
1977
1978         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1979
1980         request = compat_urllib_request.Request(url)
1981
1982         try:
1983             page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1984             mobj = re.search(r'data-users-id="([^"]+)"', page)
1985             page_base = page_base % mobj.group(1)
1986         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1987             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1988             return
1989
1990
1991         # Download video ids using BlipTV Ajax calls. Result size per
1992         # query is limited (currently to 12 videos) so we need to query
1993         # page by page until there are no video ids - it means we got
1994         # all of them.
1995
1996         video_ids = []
1997         pagenum = 1
1998
1999         while True:
2000             self.report_download_page(username, pagenum)
2001             url = page_base + "&page=" + str(pagenum)
2002             request = compat_urllib_request.Request( url )
2003             try:
2004                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
2005             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2006                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2007                 return
2008
2009             # Extract video identifiers
2010             ids_in_page = []
2011
2012             for mobj in re.finditer(r'href="/([^"]+)"', page):
2013                 if mobj.group(1) not in ids_in_page:
2014                     ids_in_page.append(unescapeHTML(mobj.group(1)))
2015
2016             video_ids.extend(ids_in_page)
2017
2018             # A little optimization - if current page is not
2019             # "full", ie. does not contain PAGE_SIZE video ids then
2020             # we can assume that this page is the last one - there
2021             # are no more ids on further pages - no need to query
2022             # again.
2023
2024             if len(ids_in_page) < self._PAGE_SIZE:
2025                 break
2026
2027             pagenum += 1
2028
2029         all_ids_count = len(video_ids)
2030         playliststart = self._downloader.params.get('playliststart', 1) - 1
2031         playlistend = self._downloader.params.get('playlistend', -1)
2032
2033         if playlistend == -1:
2034             video_ids = video_ids[playliststart:]
2035         else:
2036             video_ids = video_ids[playliststart:playlistend]
2037
2038         self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
2039                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
2040
2041         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
2042         url_entries = [self.url_result(url) for url in urls]
2043         return [self.playlist_result(url_entries, playlist_title = username)]
2044
2045
2046 class DepositFilesIE(InfoExtractor):
2047     """Information extractor for depositfiles.com"""
2048
2049     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2050
2051     def report_download_webpage(self, file_id):
2052         """Report webpage download."""
2053         self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2054
2055     def report_extraction(self, file_id):
2056         """Report information extraction."""
2057         self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2058
2059     def _real_extract(self, url):
2060         file_id = url.split('/')[-1]
2061         # Rebuild url in english locale
2062         url = 'http://depositfiles.com/en/files/' + file_id
2063
2064         # Retrieve file webpage with 'Free download' button pressed
2065         free_download_indication = { 'gateway_result' : '1' }
2066         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2067         try:
2068             self.report_download_webpage(file_id)
2069             webpage = compat_urllib_request.urlopen(request).read()
2070         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2071             self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
2072             return
2073
2074         # Search for the real file URL
2075         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2076         if (mobj is None) or (mobj.group(1) is None):
2077             # Try to figure out reason of the error.
2078             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2079             if (mobj is not None) and (mobj.group(1) is not None):
2080                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2081                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2082             else:
2083                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2084             return
2085
2086         file_url = mobj.group(1)
2087         file_extension = os.path.splitext(file_url)[1][1:]
2088
2089         # Search for file title
2090         mobj = re.search(r'<b title="(.*?)">', webpage)
2091         if mobj is None:
2092             self._downloader.trouble(u'ERROR: unable to extract title')
2093             return
2094         file_title = mobj.group(1).decode('utf-8')
2095
2096         return [{
2097             'id':       file_id.decode('utf-8'),
2098             'url':      file_url.decode('utf-8'),
2099             'uploader': None,
2100             'upload_date':  None,
2101             'title':    file_title,
2102             'ext':      file_extension.decode('utf-8'),
2103         }]
2104
2105
2106 class FacebookIE(InfoExtractor):
2107     """Information Extractor for Facebook"""
2108
2109     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2110     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2111     _NETRC_MACHINE = 'facebook'
2112     IE_NAME = u'facebook'
2113
2114     def report_login(self):
2115         """Report attempt to log in."""
2116         self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2117
2118     def _real_initialize(self):
2119         if self._downloader is None:
2120             return
2121
2122         useremail = None
2123         password = None
2124         downloader_params = self._downloader.params
2125
2126         # Attempt to use provided username and password or .netrc data
2127         if downloader_params.get('username', None) is not None:
2128             useremail = downloader_params['username']
2129             password = downloader_params['password']
2130         elif downloader_params.get('usenetrc', False):
2131             try:
2132                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2133                 if info is not None:
2134                     useremail = info[0]
2135                     password = info[2]
2136                 else:
2137                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2138             except (IOError, netrc.NetrcParseError) as err:
2139                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2140                 return
2141
2142         if useremail is None:
2143             return
2144
2145         # Log in
2146         login_form = {
2147             'email': useremail,
2148             'pass': password,
2149             'login': 'Log+In'
2150             }
2151         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2152         try:
2153             self.report_login()
2154             login_results = compat_urllib_request.urlopen(request).read()
2155             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2156                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2157                 return
2158         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2159             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2160             return
2161
2162     def _real_extract(self, url):
2163         mobj = re.match(self._VALID_URL, url)
2164         if mobj is None:
2165             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2166             return
2167         video_id = mobj.group('ID')
2168
2169         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2170         webpage = self._download_webpage(url, video_id)
2171
2172         BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2173         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2174         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2175         if not m:
2176             raise ExtractorError(u'Cannot parse data')
2177         data = dict(json.loads(m.group(1)))
2178         params_raw = compat_urllib_parse.unquote(data['params'])
2179         params = json.loads(params_raw)
2180         video_url = params['hd_src']
2181         if not video_url:
2182             video_url = params['sd_src']
2183         if not video_url:
2184             raise ExtractorError(u'Cannot find video URL')
2185         video_duration = int(params['video_duration'])
2186
2187         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2188         if not m:
2189             raise ExtractorError(u'Cannot find title in webpage')
2190         video_title = unescapeHTML(m.group(1))
2191
2192         info = {
2193             'id': video_id,
2194             'title': video_title,
2195             'url': video_url,
2196             'ext': 'mp4',
2197             'duration': video_duration,
2198             'thumbnail': params['thumbnail_src'],
2199         }
2200         return [info]
2201
2202
2203 class BlipTVIE(InfoExtractor):
2204     """Information extractor for blip.tv"""
2205
2206     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2207     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2208     IE_NAME = u'blip.tv'
2209
2210     def report_extraction(self, file_id):
2211         """Report information extraction."""
2212         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2213
2214     def report_direct_download(self, title):
2215         """Report information extraction."""
2216         self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2217
2218     def _real_extract(self, url):
2219         mobj = re.match(self._VALID_URL, url)
2220         if mobj is None:
2221             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2222             return
2223
2224         urlp = compat_urllib_parse_urlparse(url)
2225         if urlp.path.startswith('/play/'):
2226             request = compat_urllib_request.Request(url)
2227             response = compat_urllib_request.urlopen(request)
2228             redirecturl = response.geturl()
2229             rurlp = compat_urllib_parse_urlparse(redirecturl)
2230             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2231             url = 'http://blip.tv/a/a-' + file_id
2232             return self._real_extract(url)
2233
2234
2235         if '?' in url:
2236             cchar = '&'
2237         else:
2238             cchar = '?'
2239         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2240         request = compat_urllib_request.Request(json_url)
2241         request.add_header('User-Agent', 'iTunes/10.6.1')
2242         self.report_extraction(mobj.group(1))
2243         info = None
2244         try:
2245             urlh = compat_urllib_request.urlopen(request)
2246             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2247                 basename = url.split('/')[-1]
2248                 title,ext = os.path.splitext(basename)
2249                 title = title.decode('UTF-8')
2250                 ext = ext.replace('.', '')
2251                 self.report_direct_download(title)
2252                 info = {
2253                     'id': title,
2254                     'url': url,
2255                     'uploader': None,
2256                     'upload_date': None,
2257                     'title': title,
2258                     'ext': ext,
2259                     'urlhandle': urlh
2260                 }
2261         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2262             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2263         if info is None: # Regular URL
2264             try:
2265                 json_code_bytes = urlh.read()
2266                 json_code = json_code_bytes.decode('utf-8')
2267             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2268                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2269                 return
2270
2271             try:
2272                 json_data = json.loads(json_code)
2273                 if 'Post' in json_data:
2274                     data = json_data['Post']
2275                 else:
2276                     data = json_data
2277
2278                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2279                 video_url = data['media']['url']
2280                 umobj = re.match(self._URL_EXT, video_url)
2281                 if umobj is None:
2282                     raise ValueError('Can not determine filename extension')
2283                 ext = umobj.group(1)
2284
2285                 info = {
2286                     'id': data['item_id'],
2287                     'url': video_url,
2288                     'uploader': data['display_name'],
2289                     'upload_date': upload_date,
2290                     'title': data['title'],
2291                     'ext': ext,
2292                     'format': data['media']['mimeType'],
2293                     'thumbnail': data['thumbnailUrl'],
2294                     'description': data['description'],
2295                     'player_url': data['embedUrl'],
2296                     'user_agent': 'iTunes/10.6.1',
2297                 }
2298             except (ValueError,KeyError) as err:
2299                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2300                 return
2301
2302         return [info]
2303
2304
2305 class MyVideoIE(InfoExtractor):
2306     """Information Extractor for myvideo.de."""
2307
2308     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2309     IE_NAME = u'myvideo'
2310
2311     def __init__(self, downloader=None):
2312         InfoExtractor.__init__(self, downloader)
2313
2314     def report_extraction(self, video_id):
2315         """Report information extraction."""
2316         self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2317
2318     def _real_extract(self,url):
2319         mobj = re.match(self._VALID_URL, url)
2320         if mobj is None:
2321             self._download.trouble(u'ERROR: invalid URL: %s' % url)
2322             return
2323
2324         video_id = mobj.group(1)
2325
2326         # Get video webpage
2327         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2328         webpage = self._download_webpage(webpage_url, video_id)
2329
2330         self.report_extraction(video_id)
2331         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
2332                  webpage)
2333         if mobj is None:
2334             self._downloader.trouble(u'ERROR: unable to extract media URL')
2335             return
2336         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2337
2338         mobj = re.search('<title>([^<]+)</title>', webpage)
2339         if mobj is None:
2340             self._downloader.trouble(u'ERROR: unable to extract title')
2341             return
2342
2343         video_title = mobj.group(1)
2344
2345         return [{
2346             'id':       video_id,
2347             'url':      video_url,
2348             'uploader': None,
2349             'upload_date':  None,
2350             'title':    video_title,
2351             'ext':      u'flv',
2352         }]
2353
2354 class ComedyCentralIE(InfoExtractor):
2355     """Information extractor for The Daily Show and Colbert Report """
2356
2357     # urls can be abbreviations like :thedailyshow or :colbert
2358     # urls for episodes like:
2359     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2360     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2361     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2362     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2363                       |(https?://)?(www\.)?
2364                           (?P<showname>thedailyshow|colbertnation)\.com/
2365                          (full-episodes/(?P<episode>.*)|
2366                           (?P<clip>
2367                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2368                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2369                      $"""
2370
2371     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2372
2373     _video_extensions = {
2374         '3500': 'mp4',
2375         '2200': 'mp4',
2376         '1700': 'mp4',
2377         '1200': 'mp4',
2378         '750': 'mp4',
2379         '400': 'mp4',
2380     }
2381     _video_dimensions = {
2382         '3500': '1280x720',
2383         '2200': '960x540',
2384         '1700': '768x432',
2385         '1200': '640x360',
2386         '750': '512x288',
2387         '400': '384x216',
2388     }
2389
2390     @classmethod
2391     def suitable(cls, url):
2392         """Receives a URL and returns True if suitable for this IE."""
2393         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2394
2395     def report_extraction(self, episode_id):
2396         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2397
2398     def report_config_download(self, episode_id, media_id):
2399         self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2400
2401     def report_index_download(self, episode_id):
2402         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2403
2404     def _print_formats(self, formats):
2405         print('Available formats:')
2406         for x in formats:
2407             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2408
2409
2410     def _real_extract(self, url):
2411         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2412         if mobj is None:
2413             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2414             return
2415
2416         if mobj.group('shortname'):
2417             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2418                 url = u'http://www.thedailyshow.com/full-episodes/'
2419             else:
2420                 url = u'http://www.colbertnation.com/full-episodes/'
2421             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2422             assert mobj is not None
2423
2424         if mobj.group('clip'):
2425             if mobj.group('showname') == 'thedailyshow':
2426                 epTitle = mobj.group('tdstitle')
2427             else:
2428                 epTitle = mobj.group('cntitle')
2429             dlNewest = False
2430         else:
2431             dlNewest = not mobj.group('episode')
2432             if dlNewest:
2433                 epTitle = mobj.group('showname')
2434             else:
2435                 epTitle = mobj.group('episode')
2436
2437         req = compat_urllib_request.Request(url)
2438         self.report_extraction(epTitle)
2439         try:
2440             htmlHandle = compat_urllib_request.urlopen(req)
2441             html = htmlHandle.read()
2442             webpage = html.decode('utf-8')
2443         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2444             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2445             return
2446         if dlNewest:
2447             url = htmlHandle.geturl()
2448             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2449             if mobj is None:
2450                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2451                 return
2452             if mobj.group('episode') == '':
2453                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2454                 return
2455             epTitle = mobj.group('episode')
2456
2457         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2458
2459         if len(mMovieParams) == 0:
2460             # The Colbert Report embeds the information in a without
2461             # a URL prefix; so extract the alternate reference
2462             # and then add the URL prefix manually.
2463
2464             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2465             if len(altMovieParams) == 0:
2466                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2467                 return
2468             else:
2469                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2470
2471         uri = mMovieParams[0][1]
2472         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2473         self.report_index_download(epTitle)
2474         try:
2475             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2476         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2477             self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2478             return
2479
2480         results = []
2481
2482         idoc = xml.etree.ElementTree.fromstring(indexXml)
2483         itemEls = idoc.findall('.//item')
2484         for partNum,itemEl in enumerate(itemEls):
2485             mediaId = itemEl.findall('./guid')[0].text
2486             shortMediaId = mediaId.split(':')[-1]
2487             showId = mediaId.split(':')[-2].replace('.com', '')
2488             officialTitle = itemEl.findall('./title')[0].text
2489             officialDate = itemEl.findall('./pubDate')[0].text
2490
2491             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2492                         compat_urllib_parse.urlencode({'uri': mediaId}))
2493             configReq = compat_urllib_request.Request(configUrl)
2494             self.report_config_download(epTitle, shortMediaId)
2495             try:
2496                 configXml = compat_urllib_request.urlopen(configReq).read()
2497             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2498                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2499                 return
2500
2501             cdoc = xml.etree.ElementTree.fromstring(configXml)
2502             turls = []
2503             for rendition in cdoc.findall('.//rendition'):
2504                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2505                 turls.append(finfo)
2506
2507             if len(turls) == 0:
2508                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2509                 continue
2510
2511             if self._downloader.params.get('listformats', None):
2512                 self._print_formats([i[0] for i in turls])
2513                 return
2514
2515             # For now, just pick the highest bitrate
2516             format,rtmp_video_url = turls[-1]
2517
2518             # Get the format arg from the arg stream
2519             req_format = self._downloader.params.get('format', None)
2520
2521             # Select format if we can find one
2522             for f,v in turls:
2523                 if f == req_format:
2524                     format, rtmp_video_url = f, v
2525                     break
2526
2527             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2528             if not m:
2529                 raise ExtractorError(u'Cannot transform RTMP url')
2530             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2531             video_url = base + m.group('finalid')
2532
2533             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2534             info = {
2535                 'id': shortMediaId,
2536                 'url': video_url,
2537                 'uploader': showId,
2538                 'upload_date': officialDate,
2539                 'title': effTitle,
2540                 'ext': 'mp4',
2541                 'format': format,
2542                 'thumbnail': None,
2543                 'description': officialTitle,
2544             }
2545             results.append(info)
2546
2547         return results
2548
2549
2550 class EscapistIE(InfoExtractor):
2551     """Information extractor for The Escapist """
2552
2553     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2554     IE_NAME = u'escapist'
2555
2556     def report_extraction(self, showName):
2557         self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2558
2559     def report_config_download(self, showName):
2560         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2561
2562     def _real_extract(self, url):
2563         mobj = re.match(self._VALID_URL, url)
2564         if mobj is None:
2565             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2566             return
2567         showName = mobj.group('showname')
2568         videoId = mobj.group('episode')
2569
2570         self.report_extraction(showName)
2571         try:
2572             webPage = compat_urllib_request.urlopen(url)
2573             webPageBytes = webPage.read()
2574             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2575             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2576         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2577             self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2578             return
2579
2580         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2581         description = unescapeHTML(descMatch.group(1))
2582         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2583         imgUrl = unescapeHTML(imgMatch.group(1))
2584         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2585         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2586         configUrlMatch = re.search('config=(.*)$', playerUrl)
2587         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2588
2589         self.report_config_download(showName)
2590         try:
2591             configJSON = compat_urllib_request.urlopen(configUrl)
2592             m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2593             configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2594         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2595             self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2596             return
2597
2598         # Technically, it's JavaScript, not JSON
2599         configJSON = configJSON.replace("'", '"')
2600
2601         try:
2602             config = json.loads(configJSON)
2603         except (ValueError,) as err:
2604             self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2605             return
2606
2607         playlist = config['playlist']
2608         videoUrl = playlist[1]['url']
2609
2610         info = {
2611             'id': videoId,
2612             'url': videoUrl,
2613             'uploader': showName,
2614             'upload_date': None,
2615             'title': showName,
2616             'ext': 'mp4',
2617             'thumbnail': imgUrl,
2618             'description': description,
2619             'player_url': playerUrl,
2620         }
2621
2622         return [info]
2623
2624 class CollegeHumorIE(InfoExtractor):
2625     """Information extractor for collegehumor.com"""
2626
2627     _WORKING = False
2628     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2629     IE_NAME = u'collegehumor'
2630
2631     def report_manifest(self, video_id):
2632         """Report information extraction."""
2633         self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2634
2635     def report_extraction(self, video_id):
2636         """Report information extraction."""
2637         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2638
2639     def _real_extract(self, url):
2640         mobj = re.match(self._VALID_URL, url)
2641         if mobj is None:
2642             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2643             return
2644         video_id = mobj.group('videoid')
2645
2646         info = {
2647             'id': video_id,
2648             'uploader': None,
2649             'upload_date': None,
2650         }
2651
2652         self.report_extraction(video_id)
2653         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2654         try:
2655             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2656         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2657             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2658             return
2659
2660         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2661         try:
2662             videoNode = mdoc.findall('./video')[0]
2663             info['description'] = videoNode.findall('./description')[0].text
2664             info['title'] = videoNode.findall('./caption')[0].text
2665             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2666             manifest_url = videoNode.findall('./file')[0].text
2667         except IndexError:
2668             self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2669             return
2670
2671         manifest_url += '?hdcore=2.10.3'
2672         self.report_manifest(video_id)
2673         try:
2674             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2675         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2676             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2677             return
2678
2679         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2680         try:
2681             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2682             node_id = media_node.attrib['url']
2683             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2684         except IndexError as err:
2685             self._downloader.trouble(u'\nERROR: Invalid manifest file')
2686             return
2687
2688         url_pr = compat_urllib_parse_urlparse(manifest_url)
2689         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2690
2691         info['url'] = url
2692         info['ext'] = 'f4f'
2693         return [info]
2694
2695
2696 class XVideosIE(InfoExtractor):
2697     """Information extractor for xvideos.com"""
2698
2699     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2700     IE_NAME = u'xvideos'
2701
2702     def report_extraction(self, video_id):
2703         """Report information extraction."""
2704         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2705
2706     def _real_extract(self, url):
2707         mobj = re.match(self._VALID_URL, url)
2708         if mobj is None:
2709             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2710             return
2711         video_id = mobj.group(1)
2712
2713         webpage = self._download_webpage(url, video_id)
2714
2715         self.report_extraction(video_id)
2716
2717
2718         # Extract video URL
2719         mobj = re.search(r'flv_url=(.+?)&', webpage)
2720         if mobj is None:
2721             self._downloader.trouble(u'ERROR: unable to extract video url')
2722             return
2723         video_url = compat_urllib_parse.unquote(mobj.group(1))
2724
2725
2726         # Extract title
2727         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2728         if mobj is None:
2729             self._downloader.trouble(u'ERROR: unable to extract video title')
2730             return
2731         video_title = mobj.group(1)
2732
2733
2734         # Extract video thumbnail
2735         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2736         if mobj is None:
2737             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2738             return
2739         video_thumbnail = mobj.group(0)
2740
2741         info = {
2742             'id': video_id,
2743             'url': video_url,
2744             'uploader': None,
2745             'upload_date': None,
2746             'title': video_title,
2747             'ext': 'flv',
2748             'thumbnail': video_thumbnail,
2749             'description': None,
2750         }
2751
2752         return [info]
2753
2754
2755 class SoundcloudIE(InfoExtractor):
2756     """Information extractor for soundcloud.com
2757        To access the media, the uid of the song and a stream token
2758        must be extracted from the page source and the script must make
2759        a request to media.soundcloud.com/crossdomain.xml. Then
2760        the media can be grabbed by requesting from an url composed
2761        of the stream token and uid
2762      """
2763
2764     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2765     IE_NAME = u'soundcloud'
2766
2767     def __init__(self, downloader=None):
2768         InfoExtractor.__init__(self, downloader)
2769
2770     def report_resolve(self, video_id):
2771         """Report information extraction."""
2772         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2773
2774     def report_extraction(self, video_id):
2775         """Report information extraction."""
2776         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2777
2778     def _real_extract(self, url):
2779         mobj = re.match(self._VALID_URL, url)
2780         if mobj is None:
2781             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2782             return
2783
2784         # extract uploader (which is in the url)
2785         uploader = mobj.group(1)
2786         # extract simple title (uploader + slug of song title)
2787         slug_title =  mobj.group(2)
2788         simple_title = uploader + u'-' + slug_title
2789
2790         self.report_resolve('%s/%s' % (uploader, slug_title))
2791
2792         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2793         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2794         request = compat_urllib_request.Request(resolv_url)
2795         try:
2796             info_json_bytes = compat_urllib_request.urlopen(request).read()
2797             info_json = info_json_bytes.decode('utf-8')
2798         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2799             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2800             return
2801
2802         info = json.loads(info_json)
2803         video_id = info['id']
2804         self.report_extraction('%s/%s' % (uploader, slug_title))
2805
2806         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2807         request = compat_urllib_request.Request(streams_url)
2808         try:
2809             stream_json_bytes = compat_urllib_request.urlopen(request).read()
2810             stream_json = stream_json_bytes.decode('utf-8')
2811         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2812             self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2813             return
2814
2815         streams = json.loads(stream_json)
2816         mediaURL = streams['http_mp3_128_url']
2817
2818         return [{
2819             'id':       info['id'],
2820             'url':      mediaURL,
2821             'uploader': info['user']['username'],
2822             'upload_date':  info['created_at'],
2823             'title':    info['title'],
2824             'ext':      u'mp3',
2825             'description': info['description'],
2826         }]
2827
2828
2829 class InfoQIE(InfoExtractor):
2830     """Information extractor for infoq.com"""
2831     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2832
2833     def report_extraction(self, video_id):
2834         """Report information extraction."""
2835         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2836
2837     def _real_extract(self, url):
2838         mobj = re.match(self._VALID_URL, url)
2839         if mobj is None:
2840             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2841             return
2842
2843         webpage = self._download_webpage(url, video_id=url)
2844         self.report_extraction(url)
2845
2846         # Extract video URL
2847         mobj = re.search(r"jsclassref='([^']*)'", webpage)
2848         if mobj is None:
2849             self._downloader.trouble(u'ERROR: unable to extract video url')
2850             return
2851         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2852         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2853
2854         # Extract title
2855         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2856         if mobj is None:
2857             self._downloader.trouble(u'ERROR: unable to extract video title')
2858             return
2859         video_title = mobj.group(1)
2860
2861         # Extract description
2862         video_description = u'No description available.'
2863         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2864         if mobj is not None:
2865             video_description = mobj.group(1)
2866
2867         video_filename = video_url.split('/')[-1]
2868         video_id, extension = video_filename.split('.')
2869
2870         info = {
2871             'id': video_id,
2872             'url': video_url,
2873             'uploader': None,
2874             'upload_date': None,
2875             'title': video_title,
2876             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2877             'thumbnail': None,
2878             'description': video_description,
2879         }
2880
2881         return [info]
2882
2883 class MixcloudIE(InfoExtractor):
2884     """Information extractor for www.mixcloud.com"""
2885
2886     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2887     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2888     IE_NAME = u'mixcloud'
2889
2890     def __init__(self, downloader=None):
2891         InfoExtractor.__init__(self, downloader)
2892
2893     def report_download_json(self, file_id):
2894         """Report JSON download."""
2895         self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2896
2897     def report_extraction(self, file_id):
2898         """Report information extraction."""
2899         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2900
2901     def get_urls(self, jsonData, fmt, bitrate='best'):
2902         """Get urls from 'audio_formats' section in json"""
2903         file_url = None
2904         try:
2905             bitrate_list = jsonData[fmt]
2906             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2907                 bitrate = max(bitrate_list) # select highest
2908
2909             url_list = jsonData[fmt][bitrate]
2910         except TypeError: # we have no bitrate info.
2911             url_list = jsonData[fmt]
2912         return url_list
2913
2914     def check_urls(self, url_list):
2915         """Returns 1st active url from list"""
2916         for url in url_list:
2917             try:
2918                 compat_urllib_request.urlopen(url)
2919                 return url
2920             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2921                 url = None
2922
2923         return None
2924
2925     def _print_formats(self, formats):
2926         print('Available formats:')
2927         for fmt in formats.keys():
2928             for b in formats[fmt]:
2929                 try:
2930                     ext = formats[fmt][b][0]
2931                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2932                 except TypeError: # we have no bitrate info
2933                     ext = formats[fmt][0]
2934                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2935                     break
2936
2937     def _real_extract(self, url):
2938         mobj = re.match(self._VALID_URL, url)
2939         if mobj is None:
2940             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2941             return
2942         # extract uploader & filename from url
2943         uploader = mobj.group(1).decode('utf-8')
2944         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2945
2946         # construct API request
2947         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2948         # retrieve .json file with links to files
2949         request = compat_urllib_request.Request(file_url)
2950         try:
2951             self.report_download_json(file_url)
2952             jsonData = compat_urllib_request.urlopen(request).read()
2953         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2954             self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2955             return
2956
2957         # parse JSON
2958         json_data = json.loads(jsonData)
2959         player_url = json_data['player_swf_url']
2960         formats = dict(json_data['audio_formats'])
2961
2962         req_format = self._downloader.params.get('format', None)
2963         bitrate = None
2964
2965         if self._downloader.params.get('listformats', None):
2966             self._print_formats(formats)
2967             return
2968
2969         if req_format is None or req_format == 'best':
2970             for format_param in formats.keys():
2971                 url_list = self.get_urls(formats, format_param)
2972                 # check urls
2973                 file_url = self.check_urls(url_list)
2974                 if file_url is not None:
2975                     break # got it!
2976         else:
2977             if req_format not in formats:
2978                 self._downloader.trouble(u'ERROR: format is not available')
2979                 return
2980
2981             url_list = self.get_urls(formats, req_format)
2982             file_url = self.check_urls(url_list)
2983             format_param = req_format
2984
2985         return [{
2986             'id': file_id.decode('utf-8'),
2987             'url': file_url.decode('utf-8'),
2988             'uploader': uploader.decode('utf-8'),
2989             'upload_date': None,
2990             'title': json_data['name'],
2991             'ext': file_url.split('.')[-1].decode('utf-8'),
2992             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2993             'thumbnail': json_data['thumbnail_url'],
2994             'description': json_data['description'],
2995             'player_url': player_url.decode('utf-8'),
2996         }]
2997
2998 class StanfordOpenClassroomIE(InfoExtractor):
2999     """Information extractor for Stanford's Open ClassRoom"""
3000
3001     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3002     IE_NAME = u'stanfordoc'
3003
3004     def report_download_webpage(self, objid):
3005         """Report information extraction."""
3006         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3007
3008     def report_extraction(self, video_id):
3009         """Report information extraction."""
3010         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3011
3012     def _real_extract(self, url):
3013         mobj = re.match(self._VALID_URL, url)
3014         if mobj is None:
3015             raise ExtractorError(u'Invalid URL: %s' % url)
3016
3017         if mobj.group('course') and mobj.group('video'): # A specific video
3018             course = mobj.group('course')
3019             video = mobj.group('video')
3020             info = {
3021                 'id': course + '_' + video,
3022                 'uploader': None,
3023                 'upload_date': None,
3024             }
3025
3026             self.report_extraction(info['id'])
3027             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3028             xmlUrl = baseUrl + video + '.xml'
3029             try:
3030                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3031             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3032                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3033                 return
3034             mdoc = xml.etree.ElementTree.fromstring(metaXml)
3035             try:
3036                 info['title'] = mdoc.findall('./title')[0].text
3037                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3038             except IndexError:
3039                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3040                 return
3041             info['ext'] = info['url'].rpartition('.')[2]
3042             return [info]
3043         elif mobj.group('course'): # A course page
3044             course = mobj.group('course')
3045             info = {
3046                 'id': course,
3047                 'type': 'playlist',
3048                 'uploader': None,
3049                 'upload_date': None,
3050             }
3051
3052             coursepage = self._download_webpage(url, info['id'],
3053                                         note='Downloading course info page',
3054                                         errnote='Unable to download course info page')
3055
3056             m = re.search('<h1>([^<]+)</h1>', coursepage)
3057             if m:
3058                 info['title'] = unescapeHTML(m.group(1))
3059             else:
3060                 info['title'] = info['id']
3061
3062             m = re.search('<description>([^<]+)</description>', coursepage)
3063             if m:
3064                 info['description'] = unescapeHTML(m.group(1))
3065
3066             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3067             info['list'] = [
3068                 {
3069                     'type': 'reference',
3070                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3071                 }
3072                     for vpage in links]
3073             results = []
3074             for entry in info['list']:
3075                 assert entry['type'] == 'reference'
3076                 results += self.extract(entry['url'])
3077             return results
3078         else: # Root page
3079             info = {
3080                 'id': 'Stanford OpenClassroom',
3081                 'type': 'playlist',
3082                 'uploader': None,
3083                 'upload_date': None,
3084             }
3085
3086             self.report_download_webpage(info['id'])
3087             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3088             try:
3089                 rootpage = compat_urllib_request.urlopen(rootURL).read()
3090             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3091                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3092                 return
3093
3094             info['title'] = info['id']
3095
3096             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3097             info['list'] = [
3098                 {
3099                     'type': 'reference',
3100                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3101                 }
3102                     for cpage in links]
3103
3104             results = []
3105             for entry in info['list']:
3106                 assert entry['type'] == 'reference'
3107                 results += self.extract(entry['url'])
3108             return results
3109
3110 class MTVIE(InfoExtractor):
3111     """Information extractor for MTV.com"""
3112
3113     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3114     IE_NAME = u'mtv'
3115
3116     def report_extraction(self, video_id):
3117         """Report information extraction."""
3118         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3119
3120     def _real_extract(self, url):
3121         mobj = re.match(self._VALID_URL, url)
3122         if mobj is None:
3123             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3124             return
3125         if not mobj.group('proto'):
3126             url = 'http://' + url
3127         video_id = mobj.group('videoid')
3128
3129         webpage = self._download_webpage(url, video_id)
3130
3131         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3132         if mobj is None:
3133             self._downloader.trouble(u'ERROR: unable to extract song name')
3134             return
3135         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3136         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3137         if mobj is None:
3138             self._downloader.trouble(u'ERROR: unable to extract performer')
3139             return
3140         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3141         video_title = performer + ' - ' + song_name
3142
3143         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3144         if mobj is None:
3145             self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3146             return
3147         mtvn_uri = mobj.group(1)
3148
3149         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3150         if mobj is None:
3151             self._downloader.trouble(u'ERROR: unable to extract content id')
3152             return
3153         content_id = mobj.group(1)
3154
3155         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3156         self.report_extraction(video_id)
3157         request = compat_urllib_request.Request(videogen_url)
3158         try:
3159             metadataXml = compat_urllib_request.urlopen(request).read()
3160         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3161             self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3162             return
3163
3164         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3165         renditions = mdoc.findall('.//rendition')
3166
3167         # For now, always pick the highest quality.
3168         rendition = renditions[-1]
3169
3170         try:
3171             _,_,ext = rendition.attrib['type'].partition('/')
3172             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3173             video_url = rendition.find('./src').text
3174         except KeyError:
3175             self._downloader.trouble('Invalid rendition field.')
3176             return
3177
3178         info = {
3179             'id': video_id,
3180             'url': video_url,
3181             'uploader': performer,
3182             'upload_date': None,
3183             'title': video_title,
3184             'ext': ext,
3185             'format': format,
3186         }
3187
3188         return [info]
3189
3190
3191 class YoukuIE(InfoExtractor):
3192     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3193
3194     def report_download_webpage(self, file_id):
3195         """Report webpage download."""
3196         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3197
3198     def report_extraction(self, file_id):
3199         """Report information extraction."""
3200         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3201
3202     def _gen_sid(self):
3203         nowTime = int(time.time() * 1000)
3204         random1 = random.randint(1000,1998)
3205         random2 = random.randint(1000,9999)
3206
3207         return "%d%d%d" %(nowTime,random1,random2)
3208
3209     def _get_file_ID_mix_string(self, seed):
3210         mixed = []
3211         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3212         seed = float(seed)
3213         for i in range(len(source)):
3214             seed  =  (seed * 211 + 30031 ) % 65536
3215             index  =  math.floor(seed / 65536 * len(source) )
3216             mixed.append(source[int(index)])
3217             source.remove(source[int(index)])
3218         #return ''.join(mixed)
3219         return mixed
3220
3221     def _get_file_id(self, fileId, seed):
3222         mixed = self._get_file_ID_mix_string(seed)
3223         ids = fileId.split('*')
3224         realId = []
3225         for ch in ids:
3226             if ch:
3227                 realId.append(mixed[int(ch)])
3228         return ''.join(realId)
3229
3230     def _real_extract(self, url):
3231         mobj = re.match(self._VALID_URL, url)
3232         if mobj is None:
3233             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3234             return
3235         video_id = mobj.group('ID')
3236
3237         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3238
3239         request = compat_urllib_request.Request(info_url, None, std_headers)
3240         try:
3241             self.report_download_webpage(video_id)
3242             jsondata = compat_urllib_request.urlopen(request).read()
3243         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3244             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3245             return
3246
3247         self.report_extraction(video_id)
3248         try:
3249             jsonstr = jsondata.decode('utf-8')
3250             config = json.loads(jsonstr)
3251
3252             video_title =  config['data'][0]['title']
3253             seed = config['data'][0]['seed']
3254
3255             format = self._downloader.params.get('format', None)
3256             supported_format = list(config['data'][0]['streamfileids'].keys())
3257
3258             if format is None or format == 'best':
3259                 if 'hd2' in supported_format:
3260                     format = 'hd2'
3261                 else:
3262                     format = 'flv'
3263                 ext = u'flv'
3264             elif format == 'worst':
3265                 format = 'mp4'
3266                 ext = u'mp4'
3267             else:
3268                 format = 'flv'
3269                 ext = u'flv'
3270
3271
3272             fileid = config['data'][0]['streamfileids'][format]
3273             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3274         except (UnicodeDecodeError, ValueError, KeyError):
3275             self._downloader.trouble(u'ERROR: unable to extract info section')
3276             return
3277
3278         files_info=[]
3279         sid = self._gen_sid()
3280         fileid = self._get_file_id(fileid, seed)
3281
3282         #column 8,9 of fileid represent the segment number
3283         #fileid[7:9] should be changed
3284         for index, key in enumerate(keys):
3285
3286             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3287             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3288
3289             info = {
3290                 'id': '%s_part%02d' % (video_id, index),
3291                 'url': download_url,
3292                 'uploader': None,
3293                 'upload_date': None,
3294                 'title': video_title,
3295                 'ext': ext,
3296             }
3297             files_info.append(info)
3298
3299         return files_info
3300
3301
3302 class XNXXIE(InfoExtractor):
3303     """Information extractor for xnxx.com"""
3304
3305     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3306     IE_NAME = u'xnxx'
3307     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3308     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3309     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3310
3311     def report_webpage(self, video_id):
3312         """Report information extraction"""
3313         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3314
3315     def report_extraction(self, video_id):
3316         """Report information extraction"""
3317         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3318
3319     def _real_extract(self, url):
3320         mobj = re.match(self._VALID_URL, url)
3321         if mobj is None:
3322             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3323             return
3324         video_id = mobj.group(1)
3325
3326         self.report_webpage(video_id)
3327
3328         # Get webpage content
3329         try:
3330             webpage_bytes = compat_urllib_request.urlopen(url).read()
3331             webpage = webpage_bytes.decode('utf-8')
3332         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3333             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3334             return
3335
3336         result = re.search(self.VIDEO_URL_RE, webpage)
3337         if result is None:
3338             self._downloader.trouble(u'ERROR: unable to extract video url')
3339             return
3340         video_url = compat_urllib_parse.unquote(result.group(1))
3341
3342         result = re.search(self.VIDEO_TITLE_RE, webpage)
3343         if result is None:
3344             self._downloader.trouble(u'ERROR: unable to extract video title')
3345             return
3346         video_title = result.group(1)
3347
3348         result = re.search(self.VIDEO_THUMB_RE, webpage)
3349         if result is None:
3350             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3351             return
3352         video_thumbnail = result.group(1)
3353
3354         return [{
3355             'id': video_id,
3356             'url': video_url,
3357             'uploader': None,
3358             'upload_date': None,
3359             'title': video_title,
3360             'ext': 'flv',
3361             'thumbnail': video_thumbnail,
3362             'description': None,
3363         }]
3364
3365
3366 class GooglePlusIE(InfoExtractor):
3367     """Information extractor for plus.google.com."""
3368
3369     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3370     IE_NAME = u'plus.google'
3371
3372     def __init__(self, downloader=None):
3373         InfoExtractor.__init__(self, downloader)
3374
3375     def report_extract_entry(self, url):
3376         """Report downloading extry"""
3377         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3378
3379     def report_date(self, upload_date):
3380         """Report downloading extry"""
3381         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3382
3383     def report_uploader(self, uploader):
3384         """Report downloading extry"""
3385         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3386
3387     def report_title(self, video_title):
3388         """Report downloading extry"""
3389         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3390
3391     def report_extract_vid_page(self, video_page):
3392         """Report information extraction."""
3393         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3394
3395     def _real_extract(self, url):
3396         # Extract id from URL
3397         mobj = re.match(self._VALID_URL, url)
3398         if mobj is None:
3399             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3400             return
3401
3402         post_url = mobj.group(0)
3403         video_id = mobj.group(1)
3404
3405         video_extension = 'flv'
3406
3407         # Step 1, Retrieve post webpage to extract further information
3408         self.report_extract_entry(post_url)
3409         request = compat_urllib_request.Request(post_url)
3410         try:
3411             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3412         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3413             self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3414             return
3415
3416         # Extract update date
3417         upload_date = None
3418         pattern = 'title="Timestamp">(.*?)</a>'
3419         mobj = re.search(pattern, webpage)
3420         if mobj:
3421             upload_date = mobj.group(1)
3422             # Convert timestring to a format suitable for filename
3423             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3424             upload_date = upload_date.strftime('%Y%m%d')
3425         self.report_date(upload_date)
3426
3427         # Extract uploader
3428         uploader = None
3429         pattern = r'rel\="author".*?>(.*?)</a>'
3430         mobj = re.search(pattern, webpage)
3431         if mobj:
3432             uploader = mobj.group(1)
3433         self.report_uploader(uploader)
3434
3435         # Extract title
3436         # Get the first line for title
3437         video_title = u'NA'
3438         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3439         mobj = re.search(pattern, webpage)
3440         if mobj:
3441             video_title = mobj.group(1)
3442         self.report_title(video_title)
3443
3444         # Step 2, Stimulate clicking the image box to launch video
3445         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3446         mobj = re.search(pattern, webpage)
3447         if mobj is None:
3448             self._downloader.trouble(u'ERROR: unable to extract video page URL')
3449
3450         video_page = mobj.group(1)
3451         request = compat_urllib_request.Request(video_page)
3452         try:
3453             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3454         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3455             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3456             return
3457         self.report_extract_vid_page(video_page)
3458
3459
3460         # Extract video links on video page
3461         """Extract video links of all sizes"""
3462         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3463         mobj = re.findall(pattern, webpage)
3464         if len(mobj) == 0:
3465             self._downloader.trouble(u'ERROR: unable to extract video links')
3466
3467         # Sort in resolution
3468         links = sorted(mobj)
3469
3470         # Choose the lowest of the sort, i.e. highest resolution
3471         video_url = links[-1]
3472         # Only get the url. The resolution part in the tuple has no use anymore
3473         video_url = video_url[-1]
3474         # Treat escaped \u0026 style hex
3475         try:
3476             video_url = video_url.decode("unicode_escape")
3477         except AttributeError: # Python 3
3478             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3479
3480
3481         return [{
3482             'id':       video_id,
3483             'url':      video_url,
3484             'uploader': uploader,
3485             'upload_date':  upload_date,
3486             'title':    video_title,
3487             'ext':      video_extension,
3488         }]
3489
3490 class NBAIE(InfoExtractor):
3491     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3492     IE_NAME = u'nba'
3493
3494     def _real_extract(self, url):
3495         mobj = re.match(self._VALID_URL, url)
3496         if mobj is None:
3497             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3498             return
3499
3500         video_id = mobj.group(1)
3501         if video_id.endswith('/index.html'):
3502             video_id = video_id[:-len('/index.html')]
3503
3504         webpage = self._download_webpage(url, video_id)
3505
3506         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3507         def _findProp(rexp, default=None):
3508             m = re.search(rexp, webpage)
3509             if m:
3510                 return unescapeHTML(m.group(1))
3511             else:
3512                 return default
3513
3514         shortened_video_id = video_id.rpartition('/')[2]
3515         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3516         info = {
3517             'id': shortened_video_id,
3518             'url': video_url,
3519             'ext': 'mp4',
3520             'title': title,
3521             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3522             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3523         }
3524         return [info]
3525
3526 class JustinTVIE(InfoExtractor):
3527     """Information extractor for justin.tv and twitch.tv"""
3528     # TODO: One broadcast may be split into multiple videos. The key
3529     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3530     # starts at 1 and increases. Can we treat all parts as one video?
3531
3532     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3533         ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3534     _JUSTIN_PAGE_LIMIT = 100
3535     IE_NAME = u'justin.tv'
3536
3537     def report_extraction(self, file_id):
3538         """Report information extraction."""
3539         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3540
3541     def report_download_page(self, channel, offset):
3542         """Report attempt to download a single page of videos."""
3543         self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3544                 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3545
3546     # Return count of items, list of *valid* items
3547     def _parse_page(self, url):
3548         try:
3549             urlh = compat_urllib_request.urlopen(url)
3550             webpage_bytes = urlh.read()
3551             webpage = webpage_bytes.decode('utf-8', 'ignore')
3552         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3553             self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3554             return
3555
3556         response = json.loads(webpage)
3557         if type(response) != list:
3558             error_text = response.get('error', 'unknown error')
3559             self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3560             return
3561         info = []
3562         for clip in response:
3563             video_url = clip['video_file_url']
3564             if video_url:
3565                 video_extension = os.path.splitext(video_url)[1][1:]
3566                 video_date = re.sub('-', '', clip['start_time'][:10])
3567                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3568                 video_id = clip['id']
3569                 video_title = clip.get('title', video_id)
3570                 info.append({
3571                     'id': video_id,
3572                     'url': video_url,
3573                     'title': video_title,
3574                     'uploader': clip.get('channel_name', video_uploader_id),
3575                     'uploader_id': video_uploader_id,
3576                     'upload_date': video_date,
3577                     'ext': video_extension,
3578                 })
3579         return (len(response), info)
3580
3581     def _real_extract(self, url):
3582         mobj = re.match(self._VALID_URL, url)
3583         if mobj is None:
3584             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3585             return
3586
3587         api = 'http://api.justin.tv'
3588         video_id = mobj.group(mobj.lastindex)
3589         paged = False
3590         if mobj.lastindex == 1:
3591             paged = True
3592             api += '/channel/archives/%s.json'
3593         else:
3594             api += '/broadcast/by_archive/%s.json'
3595         api = api % (video_id,)
3596
3597         self.report_extraction(video_id)
3598
3599         info = []
3600         offset = 0
3601         limit = self._JUSTIN_PAGE_LIMIT
3602         while True:
3603             if paged:
3604                 self.report_download_page(video_id, offset)
3605             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3606             page_count, page_info = self._parse_page(page_url)
3607             info.extend(page_info)
3608             if not paged or page_count != limit:
3609                 break
3610             offset += limit
3611         return info
3612
3613 class FunnyOrDieIE(InfoExtractor):
3614     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3615
3616     def _real_extract(self, url):
3617         mobj = re.match(self._VALID_URL, url)
3618         if mobj is None:
3619             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3620             return
3621
3622         video_id = mobj.group('id')
3623         webpage = self._download_webpage(url, video_id)
3624
3625         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3626         if not m:
3627             self._downloader.trouble(u'ERROR: unable to find video information')
3628         video_url = unescapeHTML(m.group('url'))
3629
3630         m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3631         if not m:
3632             self._downloader.trouble(u'Cannot find video title')
3633         title = unescapeHTML(m.group('title'))
3634
3635         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3636         if m:
3637             desc = unescapeHTML(m.group('desc'))
3638         else:
3639             desc = None
3640
3641         info = {
3642             'id': video_id,
3643             'url': video_url,
3644             'ext': 'mp4',
3645             'title': title,
3646             'description': desc,
3647         }
3648         return [info]
3649
3650 class SteamIE(InfoExtractor):
3651     _VALID_URL = r"""http://store.steampowered.com/
3652                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3653                 (?P<gameID>\d+)/?
3654                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3655                 """
3656
3657     @classmethod
3658     def suitable(cls, url):
3659         """Receives a URL and returns True if suitable for this IE."""
3660         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3661
3662     def _real_extract(self, url):
3663         m = re.match(self._VALID_URL, url, re.VERBOSE)
3664         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3665         gameID = m.group('gameID')
3666         videourl = 'http://store.steampowered.com/video/%s/' % gameID
3667         webpage = self._download_webpage(videourl, gameID)
3668         mweb = re.finditer(urlRE, webpage)
3669         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3670         titles = re.finditer(namesRE, webpage)
3671         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3672         thumbs = re.finditer(thumbsRE, webpage)
3673         videos = []
3674         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3675             video_id = vid.group('videoID')
3676             title = vtitle.group('videoName')
3677             video_url = vid.group('videoURL')
3678             video_thumb = thumb.group('thumbnail')
3679             if not video_url:
3680                 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3681             info = {
3682                 'id':video_id,
3683                 'url':video_url,
3684                 'ext': 'flv',
3685                 'title': unescapeHTML(title),
3686                 'thumbnail': video_thumb
3687                   }
3688             videos.append(info)
3689         return videos
3690
3691 class UstreamIE(InfoExtractor):
3692     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3693     IE_NAME = u'ustream'
3694
3695     def _real_extract(self, url):
3696         m = re.match(self._VALID_URL, url)
3697         video_id = m.group('videoID')
3698         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3699         webpage = self._download_webpage(url, video_id)
3700         m = re.search(r'data-title="(?P<title>.+)"',webpage)
3701         title = m.group('title')
3702         m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3703         uploader = m.group('uploader')
3704         info = {
3705                 'id':video_id,
3706                 'url':video_url,
3707                 'ext': 'flv',
3708                 'title': title,
3709                 'uploader': uploader
3710                   }
3711         return [info]
3712
3713 class RBMARadioIE(InfoExtractor):
3714     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3715
3716     def _real_extract(self, url):
3717         m = re.match(self._VALID_URL, url)
3718         video_id = m.group('videoID')
3719
3720         webpage = self._download_webpage(url, video_id)
3721         m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3722         if not m:
3723             raise ExtractorError(u'Cannot find metadata')
3724         json_data = m.group(1)
3725
3726         try:
3727             data = json.loads(json_data)
3728         except ValueError as e:
3729             raise ExtractorError(u'Invalid JSON: ' + str(e))
3730
3731         video_url = data['akamai_url'] + '&cbr=256'
3732         url_parts = compat_urllib_parse_urlparse(video_url)
3733         video_ext = url_parts.path.rpartition('.')[2]
3734         info = {
3735                 'id': video_id,
3736                 'url': video_url,
3737                 'ext': video_ext,
3738                 'title': data['title'],
3739                 'description': data.get('teaser_text'),
3740                 'location': data.get('country_of_origin'),
3741                 'uploader': data.get('host', {}).get('name'),
3742                 'uploader_id': data.get('host', {}).get('slug'),
3743                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3744                 'duration': data.get('duration'),
3745         }
3746         return [info]
3747
3748
3749 class YouPornIE(InfoExtractor):
3750     """Information extractor for youporn.com."""
3751     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3752
3753     def _print_formats(self, formats):
3754         """Print all available formats"""
3755         print(u'Available formats:')
3756         print(u'ext\t\tformat')
3757         print(u'---------------------------------')
3758         for format in formats:
3759             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3760
3761     def _specific(self, req_format, formats):
3762         for x in formats:
3763             if(x["format"]==req_format):
3764                 return x
3765         return None
3766
3767     def _real_extract(self, url):
3768         mobj = re.match(self._VALID_URL, url)
3769         if mobj is None:
3770             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3771             return
3772
3773         video_id = mobj.group('videoid')
3774
3775         req = compat_urllib_request.Request(url)
3776         req.add_header('Cookie', 'age_verified=1')
3777         webpage = self._download_webpage(req, video_id)
3778
3779         # Get the video title
3780         result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3781         if result is None:
3782             raise ExtractorError(u'Unable to extract video title')
3783         video_title = result.group('title').strip()
3784
3785         # Get the video date
3786         result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3787         if result is None:
3788             self._downloader.report_warning(u'unable to extract video date')
3789             upload_date = None
3790         else:
3791             upload_date = result.group('date').strip()
3792
3793         # Get the video uploader
3794         result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3795         if result is None:
3796             self._downloader.report_warning(u'unable to extract uploader')
3797             video_uploader = None
3798         else:
3799             video_uploader = result.group('uploader').strip()
3800             video_uploader = clean_html( video_uploader )
3801
3802         # Get all of the formats available
3803         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3804         result = re.search(DOWNLOAD_LIST_RE, webpage)
3805         if result is None:
3806             raise ExtractorError(u'Unable to extract download list')
3807         download_list_html = result.group('download_list').strip()
3808
3809         # Get all of the links from the page
3810         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3811         links = re.findall(LINK_RE, download_list_html)
3812         if(len(links) == 0):
3813             raise ExtractorError(u'ERROR: no known formats available for video')
3814
3815         self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3816
3817         formats = []
3818         for link in links:
3819
3820             # A link looks like this:
3821             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3822             # A path looks like this:
3823             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3824             video_url = unescapeHTML( link )
3825             path = compat_urllib_parse_urlparse( video_url ).path
3826             extension = os.path.splitext( path )[1][1:]
3827             format = path.split('/')[4].split('_')[:2]
3828             size = format[0]
3829             bitrate = format[1]
3830             format = "-".join( format )
3831             title = u'%s-%s-%s' % (video_title, size, bitrate)
3832
3833             formats.append({
3834                 'id': video_id,
3835                 'url': video_url,
3836                 'uploader': video_uploader,
3837                 'upload_date': upload_date,
3838                 'title': title,
3839                 'ext': extension,
3840                 'format': format,
3841                 'thumbnail': None,
3842                 'description': None,
3843                 'player_url': None
3844             })
3845
3846         if self._downloader.params.get('listformats', None):
3847             self._print_formats(formats)
3848             return
3849
3850         req_format = self._downloader.params.get('format', None)
3851         self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3852
3853         if req_format is None or req_format == 'best':
3854             return [formats[0]]
3855         elif req_format == 'worst':
3856             return [formats[-1]]
3857         elif req_format in ('-1', 'all'):
3858             return formats
3859         else:
3860             format = self._specific( req_format, formats )
3861             if result is None:
3862                 self._downloader.trouble(u'ERROR: requested format not available')
3863                 return
3864             return [format]
3865
3866
3867
3868 class PornotubeIE(InfoExtractor):
3869     """Information extractor for pornotube.com."""
3870     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3871
3872     def _real_extract(self, url):
3873         mobj = re.match(self._VALID_URL, url)
3874         if mobj is None:
3875             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3876             return
3877
3878         video_id = mobj.group('videoid')
3879         video_title = mobj.group('title')
3880
3881         # Get webpage content
3882         webpage = self._download_webpage(url, video_id)
3883
3884         # Get the video URL
3885         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3886         result = re.search(VIDEO_URL_RE, webpage)
3887         if result is None:
3888             self._downloader.trouble(u'ERROR: unable to extract video url')
3889             return
3890         video_url = compat_urllib_parse.unquote(result.group('url'))
3891
3892         #Get the uploaded date
3893         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3894         result = re.search(VIDEO_UPLOADED_RE, webpage)
3895         if result is None:
3896             self._downloader.trouble(u'ERROR: unable to extract video title')
3897             return
3898         upload_date = result.group('date')
3899
3900         info = {'id': video_id,
3901                 'url': video_url,
3902                 'uploader': None,
3903                 'upload_date': upload_date,
3904                 'title': video_title,
3905                 'ext': 'flv',
3906                 'format': 'flv'}
3907
3908         return [info]
3909
3910 class YouJizzIE(InfoExtractor):
3911     """Information extractor for youjizz.com."""
3912     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3913
3914     def _real_extract(self, url):
3915         mobj = re.match(self._VALID_URL, url)
3916         if mobj is None:
3917             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3918             return
3919
3920         video_id = mobj.group('videoid')
3921
3922         # Get webpage content
3923         webpage = self._download_webpage(url, video_id)
3924
3925         # Get the video title
3926         result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3927         if result is None:
3928             raise ExtractorError(u'ERROR: unable to extract video title')
3929         video_title = result.group('title').strip()
3930
3931         # Get the embed page
3932         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3933         if result is None:
3934             raise ExtractorError(u'ERROR: unable to extract embed page')
3935
3936         embed_page_url = result.group(0).strip()
3937         video_id = result.group('videoid')
3938
3939         webpage = self._download_webpage(embed_page_url, video_id)
3940
3941         # Get the video URL
3942         result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3943         if result is None:
3944             raise ExtractorError(u'ERROR: unable to extract video url')
3945         video_url = result.group('source')
3946
3947         info = {'id': video_id,
3948                 'url': video_url,
3949                 'title': video_title,
3950                 'ext': 'flv',
3951                 'format': 'flv',
3952                 'player_url': embed_page_url}
3953
3954         return [info]
3955
3956 class EightTracksIE(InfoExtractor):
3957     IE_NAME = '8tracks'
3958     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3959
3960     def _real_extract(self, url):
3961         mobj = re.match(self._VALID_URL, url)
3962         if mobj is None:
3963             raise ExtractorError(u'Invalid URL: %s' % url)
3964         playlist_id = mobj.group('id')
3965
3966         webpage = self._download_webpage(url, playlist_id)
3967
3968         m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3969         if not m:
3970             raise ExtractorError(u'Cannot find trax information')
3971         json_like = m.group(1)
3972         data = json.loads(json_like)
3973
3974         session = str(random.randint(0, 1000000000))
3975         mix_id = data['id']
3976         track_count = data['tracks_count']
3977         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3978         next_url = first_url
3979         res = []
3980         for i in itertools.count():
3981             api_json = self._download_webpage(next_url, playlist_id,
3982                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3983                 errnote=u'Failed to download song information')
3984             api_data = json.loads(api_json)
3985             track_data = api_data[u'set']['track']
3986             info = {
3987                 'id': track_data['id'],
3988                 'url': track_data['track_file_stream_url'],
3989                 'title': track_data['performer'] + u' - ' + track_data['name'],
3990                 'raw_title': track_data['name'],
3991                 'uploader_id': data['user']['login'],
3992                 'ext': 'm4a',
3993             }
3994             res.append(info)
3995             if api_data['set']['at_last_track']:
3996                 break
3997             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3998         return res
3999
4000 class KeekIE(InfoExtractor):
4001     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4002     IE_NAME = u'keek'
4003
4004     def _real_extract(self, url):
4005         m = re.match(self._VALID_URL, url)
4006         video_id = m.group('videoID')
4007         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4008         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4009         webpage = self._download_webpage(url, video_id)
4010         m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
4011         title = unescapeHTML(m.group('title'))
4012         m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4013         uploader = clean_html(m.group('uploader'))
4014         info = {
4015                 'id': video_id,
4016                 'url': video_url,
4017                 'ext': 'mp4',
4018                 'title': title,
4019                 'thumbnail': thumbnail,
4020                 'uploader': uploader
4021         }
4022         return [info]
4023
4024 class TEDIE(InfoExtractor):
4025     _VALID_URL=r'''http://www.ted.com/
4026                    (
4027                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4028                         |
4029                         ((?P<type_talk>talks)) # We have a simple talk
4030                    )
4031                    /(?P<name>\w+) # Here goes the name and then ".html"
4032                    '''
4033
4034     @classmethod
4035     def suitable(cls, url):
4036         """Receives a URL and returns True if suitable for this IE."""
4037         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4038
4039     def _real_extract(self, url):
4040         m=re.match(self._VALID_URL, url, re.VERBOSE)
4041         if m.group('type_talk'):
4042             return [self._talk_info(url)]
4043         else :
4044             playlist_id=m.group('playlist_id')
4045             name=m.group('name')
4046             self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4047             return self._playlist_videos_info(url,name,playlist_id)
4048
4049     def _talk_video_link(self,mediaSlug):
4050         '''Returns the video link for that mediaSlug'''
4051         return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4052
4053     def _playlist_videos_info(self,url,name,playlist_id=0):
4054         '''Returns the videos of the playlist'''
4055         video_RE=r'''
4056                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4057                      ([.\s]*?)data-playlist_item_id="(\d+)"
4058                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4059                      '''
4060         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4061         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4062         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4063         m_names=re.finditer(video_name_RE,webpage)
4064         info=[]
4065         for m_video, m_name in zip(m_videos,m_names):
4066             video_id=m_video.group('video_id')
4067             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4068             info.append(self._talk_info(talk_url,video_id))
4069         return info
4070
4071     def _talk_info(self, url, video_id=0):
4072         """Return the video for the talk in the url"""
4073         m=re.match(self._VALID_URL, url,re.VERBOSE)
4074         videoName=m.group('name')
4075         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4076         # If the url includes the language we get the title translated
4077         title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
4078         title=re.search(title_RE, webpage).group('title')
4079         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4080                         "id":(?P<videoID>[\d]+).*?
4081                         "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4082         thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4083         thumb_match=re.search(thumb_RE,webpage)
4084         info_match=re.search(info_RE,webpage,re.VERBOSE)
4085         video_id=info_match.group('videoID')
4086         mediaSlug=info_match.group('mediaSlug')
4087         video_url=self._talk_video_link(mediaSlug)
4088         info = {
4089                 'id': video_id,
4090                 'url': video_url,
4091                 'ext': 'mp4',
4092                 'title': title,
4093                 'thumbnail': thumb_match.group('thumbnail')
4094                 }
4095         return info
4096
4097 class MySpassIE(InfoExtractor):
4098     _VALID_URL = r'http://www.myspass.de/.*'
4099
4100     def _real_extract(self, url):
4101         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4102
4103         # video id is the last path element of the URL
4104         # usually there is a trailing slash, so also try the second but last
4105         url_path = compat_urllib_parse_urlparse(url).path
4106         url_parent_path, video_id = os.path.split(url_path)
4107         if not video_id:
4108             _, video_id = os.path.split(url_parent_path)
4109
4110         # get metadata
4111         metadata_url = META_DATA_URL_TEMPLATE % video_id
4112         metadata_text = self._download_webpage(metadata_url, video_id)
4113         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4114
4115         # extract values from metadata
4116         url_flv_el = metadata.find('url_flv')
4117         if url_flv_el is None:
4118             self._downloader.trouble(u'ERROR: unable to extract download url')
4119             return
4120         video_url = url_flv_el.text
4121         extension = os.path.splitext(video_url)[1][1:]
4122         title_el = metadata.find('title')
4123         if title_el is None:
4124             self._downloader.trouble(u'ERROR: unable to extract title')
4125             return
4126         title = title_el.text
4127         format_id_el = metadata.find('format_id')
4128         if format_id_el is None:
4129             format = ext
4130         else:
4131             format = format_id_el.text
4132         description_el = metadata.find('description')
4133         if description_el is not None:
4134             description = description_el.text
4135         else:
4136             description = None
4137         imagePreview_el = metadata.find('imagePreview')
4138         if imagePreview_el is not None:
4139             thumbnail = imagePreview_el.text
4140         else:
4141             thumbnail = None
4142         info = {
4143             'id': video_id,
4144             'url': video_url,
4145             'title': title,
4146             'ext': extension,
4147             'format': format,
4148             'thumbnail': thumbnail,
4149             'description': description
4150         }
4151         return [info]
4152
4153 class SpiegelIE(InfoExtractor):
4154     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?$'
4155
4156     def _real_extract(self, url):
4157         m = re.match(self._VALID_URL, url)
4158         video_id = m.group('videoID')
4159
4160         webpage = self._download_webpage(url, video_id)
4161         m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4162         if not m:
4163             raise ExtractorError(u'Cannot find title')
4164         video_title = unescapeHTML(m.group(1))
4165
4166         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4167         xml_code = self._download_webpage(xml_url, video_id,
4168                     note=u'Downloading XML', errnote=u'Failed to download XML')
4169
4170         idoc = xml.etree.ElementTree.fromstring(xml_code)
4171         last_type = idoc[-1]
4172         filename = last_type.findall('./filename')[0].text
4173         duration = float(last_type.findall('./duration')[0].text)
4174
4175         video_url = 'http://video2.spiegel.de/flash/' + filename
4176         video_ext = filename.rpartition('.')[2]
4177         info = {
4178             'id': video_id,
4179             'url': video_url,
4180             'ext': video_ext,
4181             'title': video_title,
4182             'duration': duration,
4183         }
4184         return [info]
4185
4186
4187 def gen_extractors():
4188     """ Return a list of an instance of every supported extractor.
4189     The order does matter; the first extractor matched is the one handling the URL.
4190     """
4191     return [
4192         YoutubePlaylistIE(),
4193         YoutubeChannelIE(),
4194         YoutubeUserIE(),
4195         YoutubeSearchIE(),
4196         YoutubeIE(),
4197         MetacafeIE(),
4198         DailymotionIE(),
4199         GoogleSearchIE(),
4200         PhotobucketIE(),
4201         YahooIE(),
4202         YahooSearchIE(),
4203         DepositFilesIE(),
4204         FacebookIE(),
4205         BlipTVUserIE(),
4206         BlipTVIE(),
4207         VimeoIE(),
4208         MyVideoIE(),
4209         ComedyCentralIE(),
4210         EscapistIE(),
4211         CollegeHumorIE(),
4212         XVideosIE(),
4213         SoundcloudIE(),
4214         InfoQIE(),
4215         MixcloudIE(),
4216         StanfordOpenClassroomIE(),
4217         MTVIE(),
4218         YoukuIE(),
4219         XNXXIE(),
4220         YouJizzIE(),
4221         PornotubeIE(),
4222         YouPornIE(),
4223         GooglePlusIE(),
4224         ArteTvIE(),
4225         NBAIE(),
4226         JustinTVIE(),
4227         FunnyOrDieIE(),
4228         SteamIE(),
4229         UstreamIE(),
4230         RBMARadioIE(),
4231         EightTracksIE(),
4232         KeekIE(),
4233         TEDIE(),
4234         MySpassIE(),
4235         SpiegelIE(),
4236         GenericIE()
4237     ]
4238
4239