2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader.

        downloader may be None; set_downloader() can attach one later.
        """
        self.set_downloader(downloader)
76 def suitable(self, url):
77 """Receives a URL and returns True if suitable for this IE."""
78 return re.match(self._VALID_URL, url) is not None
        """Getter method for _WORKING."""

        """Initializes an instance (authentication, etc)."""
        # NOTE(review): the enclosing "def" lines for these two methods are
        # elided from this chunk; _real_initialize() below is the
        # subclass-provided initialization hook.
        self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): one intermediate line is elided from this chunk
        # (presumably a call to self.initialize() — confirm against the
        # full file).
        return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        downloader may be None; methods that report progress assume a
        downloader has been attached before they run.
        """
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # NOTE(review): property definition line is elided from this chunk;
        # this body derives the IE name from the class name by dropping the
        # trailing "IE" suffix (e.g. YoutubeIE -> "Youtube").
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): this chunk elides the guard lines ("if note is
        # None:", "try:", "if errnote is None:"); the statements below are
        # reproduced as extracted, with indentation reflecting that
        # structure.
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
123 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
124 """ Returns the data of the page as a string """
125 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
126 webpage_bytes = urlh.read()
127 return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this chunk elides several source lines.  The verbose
    # regex below is shown without its opening assignment line
    # (presumably _VALID_URL = r"""^ ... — confirm against the full file),
    # and the format tables near the bottom are truncated.
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                         )?                                               # optional -> youtube.com/xxxx is OK
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (table truncated in this chunk)
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display resolution (table truncated in this chunk)
    _video_dimensions = {
191 def suitable(self, url):
192 """Receives a URL and returns True if suitable for this IE."""
193 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a subtitle track."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _get_available_subtitles(self, video_id):
        """Fetch the available subtitle tracks for *video_id*.

        Builds a {lang_code: track_name} dict from YouTube's timedtext
        listing.  On download failure or when no subtitles exist, returns a
        (warning_message, None) tuple instead — callers must check which
        shape they got.  NOTE(review): the "try:" opener and the final
        success return are elided from this chunk.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # Map lang_code -> human-readable track name.
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'WARNING: video doesn\'t have subtitles', None)
253 def _list_available_subtitles(self, video_id):
254 sub_lang_list = self._get_available_subtitles(video_id)
255 self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """Download one subtitle track.

        Returns (None, sub_lang, subtitle_text) on success and a
        (warning_message, None) tuple on failure.  NOTE(review): the
        urlencode parameter dict, the "try:" opener and an "if not sub:"
        guard are elided from this chunk.
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, sub_lang, sub)
    def _extract_subtitle(self, video_id):
        """Pick one subtitle language and download it.

        Language preference: explicit --sub-lang option, then English,
        then the first available track.  NOTE(review): the "else:" branch
        opener and the final return are elided from this chunk.
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None)
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track for *video_id*.

        NOTE(review): the "subtitles = []" initializer and the final
        return are elided from this chunk.
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
    def _print_formats(self, formats):
        """Print a table of the given itags with extension and resolution.

        NOTE(review): the "for x in formats:" loop header is elided from
        this chunk.
        """
        print('Available formats:')
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set language, then optionally log in and confirm age.

        Credentials come from --username/--password or, failing that, from
        the user's .netrc entry for the "youtube" machine.  All network
        failures here are reported as warnings (or trouble() for age
        confirmation) rather than raised.  NOTE(review): this chunk elides
        many lines ("try:" openers, early returns, parts of the login form
        dict); statements are reproduced as extracted.
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language (best effort — failure only produces a warning)
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))

        # Scrape the anti-forgery tokens out of the login page
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age (form dict partially elided in this chunk)
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Extract the video ID (group 2 of _VALID_URL) from *url*.

        NOTE(review): the "if mobj is None:" guard and the final return
        are elided from this chunk.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Extract video info dict(s) for a YouTube URL.

        Pipeline: resolve next_url redirects -> download watch page ->
        download get_video_info (trying several &el= variants) -> pull
        uploader/title/thumbnail/date/description/subtitles/duration ->
        choose format URL(s) -> build one result dict per chosen format.
        NOTE(review): this chunk elides many lines ("if"/"try:"/"else:"
        openers, early returns, the results-list assembly); statements are
        reproduced as extracted.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the JS-escaped URL (\/ -> /)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info, trying several &el= variants until one has a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (optional — warning only)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try several date formats
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # closed captions
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
                (sub_error, sub_lang, sub) = video_subtitles[0]
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # itag -> signed download URL
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter form to
        disable age filtering.  NOTE(review): "try:" openers and returns
        are elided from this chunk.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age (form dict partially elided in this chunk)
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a Metacafe page.

        yt-prefixed IDs are delegated to the YouTube extractor via
        self._downloader.download().  NOTE(review): this chunk elides many
        guard lines ("if mobj is None:", "try:", "else:", returns);
        statements are reproduced as extracted.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fall back to the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result dict (list wrapper elided in this chunk)
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best available media URL, title and uploader.

        NOTE(review): this chunk elides several guard lines ("if mobj is
        None:", branch openers, returns); statements are reproduced as
        extracted.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted videos are served.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the highest-quality stream key present in flashvars
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; recombine as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # Result dict (list wrapper elided in this chunk)
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader from the page.

        NOTE(review): this chunk elides several guard lines ("if mobj is
        None:", "try:", returns); statements are reproduced as extracted.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # Result dict (list wrapper elided in this chunk)
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
922 def _real_extract(self, url, new_video=True):
923 # Extract ID from URL
924 mobj = re.match(self._VALID_URL, url)
926 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
929 video_id = mobj.group(2)
930 video_extension = 'flv'
932 # Rewrite valid but non-extractable URLs as
933 # extractable English language /watch/ URLs
934 if re.match(self._VPAGE_URL, url) is None:
935 request = compat_urllib_request.Request(url)
937 webpage = compat_urllib_request.urlopen(request).read()
938 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
939 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
942 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
944 self._downloader.trouble(u'ERROR: Unable to extract id field')
946 yahoo_id = mobj.group(1)
948 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
950 self._downloader.trouble(u'ERROR: Unable to extract vid field')
952 yahoo_vid = mobj.group(1)
954 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
955 return self._real_extract(url, new_video=False)
957 # Retrieve video webpage to extract further information
958 request = compat_urllib_request.Request(url)
960 self.report_download_webpage(video_id)
961 webpage = compat_urllib_request.urlopen(request).read()
962 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
963 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
966 # Extract uploader and title from webpage
967 self.report_extraction(video_id)
968 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
970 self._downloader.trouble(u'ERROR: unable to extract video title')
972 video_title = mobj.group(1).decode('utf-8')
974 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
976 self._downloader.trouble(u'ERROR: unable to extract video uploader')
978 video_uploader = mobj.group(1).decode('utf-8')
980 # Extract video thumbnail
981 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
983 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
985 video_thumbnail = mobj.group(1).decode('utf-8')
987 # Extract video description
988 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
990 self._downloader.trouble(u'ERROR: unable to extract video description')
992 video_description = mobj.group(1).decode('utf-8')
993 if not video_description:
994 video_description = 'No description available.'
996 # Extract video height and width
997 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
999 self._downloader.trouble(u'ERROR: unable to extract video height')
1001 yv_video_height = mobj.group(1)
1003 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1005 self._downloader.trouble(u'ERROR: unable to extract video width')
1007 yv_video_width = mobj.group(1)
1009 # Retrieve video playlist to extract media URL
1010 # I'm not completely sure what all these options are, but we
1011 # seem to need most of them, otherwise the server sends a 401.
1012 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1013 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1014 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1015 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1016 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1018 self.report_download_webpage(video_id)
1019 webpage = compat_urllib_request.urlopen(request).read()
1020 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1021 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1024 # Extract media URL from playlist XML
1025 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1027 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1029 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1030 video_url = unescapeHTML(video_url)
1033 'id': video_id.decode('utf-8'),
1035 'uploader': video_uploader,
1036 'upload_date': None,
1037 'title': video_title,
1038 'ext': video_extension.decode('utf-8'),
1039 'thumbnail': video_thumbnail.decode('utf-8'),
1040 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL.

        Returns a single-element list of info dictionaries, or None after
        reporting trouble via the downloader.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize schemeless and player/HLS-redirect URLs to a canonical page URL
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the player page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available (codec, ext, quality) triple, best quality first
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download a webpage and return its raw body, or None after reporting trouble."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url and match regex against it.

        matchTuples is a list of (group_index, key, error_message); each
        named group is stored under key in the returned dict. Reports
        trouble and returns None when the page or a group is missing.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Extract the rtmp URL of a live stream page (result is assembled but not returned upstream)."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            (r'(s_artestras_scst_geoFRDE_' + video_lang + r'.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                r'(rtmp://.*?)\''),
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of XML references of an arte+7 page and return an info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            (r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>'),
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or plus7 extractor based on the URL shape."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Returns True when url redirected somewhere else (and queues the new
        URL for download), False when it resolves to itself.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our own opener so the HEAD/fallback handlers are used
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Best-effort extraction: scrape common embedded-player patterns from the page."""
        if self._test_redirect(url):
            return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearchN:/ytsearchall: prefix and queue the matching videos."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # GData pages hold at most 50 results; keep paging until we have enough
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # totalItems caps the effective limit when the API has fewer results than requested
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearchN:/gvsearchall: prefix and queue the matching videos."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No further pages: flush what was collected
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearchN:/yvsearchall: prefix and queue the matching videos."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()  # de-duplicate across result pages
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No further pages: flush what was collected
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a playlist and queue them for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: group(3) carries a direct video id
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Honour --playlist-start / --playlist-end (1-based; -1 means "to the end")
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a channel and queue them for download."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids of a user via the GData API and queue them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end (1-based; -1 means "to the end")
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves a blip.tv user page (or a ``bliptvuser:`` pseudo-URL) to the
    user's numeric id, pages through the mobile episode-list API collecting
    video ids, then queues each video URL for download.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Results per API page; a short page signals the last page.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username from the URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            # The user page embeds the numeric users_id we need for the API.
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request(page_base + "&page=" + str(pagenum))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Fix: use compat_str(err) for consistency with every other
                # error path in this file (was plain str(err)).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # playliststart is 1-based in params; convert to 0-based slice index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        # Fix: Request data must be bytes on Python 3 (urlencode returns str).
        request = compat_urllib_request.Request(url,
                compat_urllib_parse.urlencode(free_download_indication).encode('ascii'))
        try:
            self.report_download_webpage(file_id)
            # Fix: decode the response so the str regexes below can match it.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace so the site's message fits on one line.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Fix: these values are already str on Python 3 — the old
        # .decode('utf-8') calls raised AttributeError.
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in with --username/--password or .netrc credentials, if any."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # Fix: Request data must be bytes on Python 3 (urlencode returns str).
        request = compat_urllib_request.Request(self._LOGIN_URL,
                compat_urllib_parse.urlencode(login_form).encode('ascii'))
        try:
            self.report_login()
            # Fix: decode the response so the str regex below can match it.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The video parameters are embedded in a JS swf.addVariable block;
        # BEFORE/AFTER bracket the JSON array we need.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask the page for its JSON metadata ("skin=json").
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different (usable) data to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                # Fix: basename is already str on Python 3; the old
                # title.decode('UTF-8') raised AttributeError.
                title, ext = os.path.splitext(basename)
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh,
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError, KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fix: was self._download.trouble — a typo that raised
            # AttributeError instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Override: _VALID_URL is written with re.VERBOSE whitespace.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand :tds / :colbert abbreviations to the newest-episode URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode";
            # the site redirects it to a specific one.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Fix: the requested format is loop-invariant — read it once
        # instead of on every episode part.
        req_format = self._downloader.params.get('format', None)

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum, itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            # (renamed from `format`, which shadowed the builtin).
            video_format, rtmp_video_url = turls[-1]

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    video_format, rtmp_video_url = f, v

            # The RTMP stream is mirrored over plain HTTP; rewrite the URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': video_format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = match.group('showname')
        videoId = match.group('episode')

        self.report_extraction(showName)
        try:
            # Honour the charset declared in the Content-Type header,
            # defaulting to UTF-8 when none is given.
            handle = compat_urllib_request.urlopen(url)
            raw_page = handle.read()
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', handle.headers['Content-Type'])
            page = raw_page.decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Pull description, thumbnail and the flash player URL out of the
        # page's meta tags; the config URL is a query param of the player.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        imgUrl = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        playerUrl = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        configUrl = compat_urllib_parse.unquote(re.search('config=(.*)$', playerUrl).group(1))

        self.report_config_download(showName)
        try:
            config_handle = compat_urllib_request.urlopen(configUrl)
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            config_text = config_handle.read().decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        return [{
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # Metadata (title, description, f4m manifest URL) lives in a
        # moogaloop XML document.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest names the media segment; reassemble the real URL
        # from its pieces.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            # Fix: dropped the unused `as err` binding.
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'mp4'

        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url parameter)
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Extract title from the page <title> tag
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Extract video thumbnail
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both the uploader and the track slug are encoded in the URL itself.
        uploader = match.group(1)
        slug_title = match.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Step 1: resolve the page URL to track metadata via the public API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Step 2: fetch the stream table and pick the 128kbps mp3 URL.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the RTMP id is base64-encoded in jsclassref.
        m = re.search(r"jsclassref='([^']*)'", webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = m.group(1)

        # Extract description (optional; fall back to a placeholder)
        video_description = u'No description available.'
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if m is not None:
            video_description = m.group(1)

        # The file name doubles as the video id; its suffix is the extension.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass
        return None

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # Fix: regex groups are already str on Python 3 — the old
        # .decode('utf-8') calls raised AttributeError.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode explicitly so this also works before Py3.6)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each advertised format and keep the first live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            # Fix: replaced the fragile `and/or` ternary (and a str.decode)
            # with a conditional expression.
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
# Extractor for openclassroom.stanford.edu. _real_extract branches on the
# named groups of _VALID_URL: a specific video (course+video), a course
# page (course only), or the site root (neither) — the latter two build
# 'playlist'-style info whose entries are re-dispatched via self.extract().
# NOTE(review): gapped listing — try:/except framing, "if mobj is None:"
# guards, dict braces and return statements are absent (numbering holes).
2940 class StanfordOpenClassroomIE(InfoExtractor):
2941 """Information extractor for Stanford's Open ClassRoom"""
2943 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2944 IE_NAME = u'stanfordoc'
2946 def report_download_webpage(self, objid):
2947 """Report information extraction."""
2948 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2950 def report_extraction(self, video_id):
2951 """Report information extraction."""
2952 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2954 def _real_extract(self, url):
2955 mobj = re.match(self._VALID_URL, url)
2957 raise ExtractorError(u'Invalid URL: %s' % url)
2959 if mobj.group('course') and mobj.group('video'): # A specific video
2960 course = mobj.group('course')
2961 video = mobj.group('video')
2963 'id': course + '_' + video,
2965 'upload_date': None,
2968 self.report_extraction(info['id'])
# video metadata lives in an XML file next to the course's videos folder
2969 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2970 xmlUrl = baseUrl + video + '.xml'
2972 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2973 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2974 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2976 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2978 info['title'] = mdoc.findall('./title')[0].text
2979 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2981 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2983 info['ext'] = info['url'].rpartition('.')[2]
2985 elif mobj.group('course'): # A course page
2986 course = mobj.group('course')
2991 'upload_date': None,
2994 coursepage = self._download_webpage(url, info['id'],
2995 note='Downloading course info page',
2996 errnote='Unable to download course info page')
2998 m = re.search('<h1>([^<]+)</h1>', coursepage)
3000 info['title'] = unescapeHTML(m.group(1))
# fall back to the course id when no <h1> title was found
3002 info['title'] = info['id']
3004 m = re.search('<description>([^<]+)</description>', coursepage)
3006 info['description'] = unescapeHTML(m.group(1))
# each VideoPage link becomes a 'reference' entry extracted recursively
3008 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3011 'type': 'reference',
3012 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3016 for entry in info['list']:
3017 assert entry['type'] == 'reference'
3018 results += self.extract(entry['url'])
# neither course nor video: enumerate all courses from the site root
3022 'id': 'Stanford OpenClassroom',
3025 'upload_date': None,
3028 self.report_download_webpage(info['id'])
3029 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3031 rootpage = compat_urllib_request.urlopen(rootURL).read()
3032 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3033 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3036 info['title'] = info['id']
3038 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3041 'type': 'reference',
3042 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3047 for entry in info['list']:
3048 assert entry['type'] == 'reference'
3049 results += self.extract(entry['url'])
# Extractor for mtv.com video pages: scrapes mtv_vt/mtv_an/mtvn_uri meta
# tags plus the default playlist id, then fetches a mediaGen XML document
# and takes the last (highest-quality) <rendition> as the video URL.
# NOTE(review): gapped listing — "if mobj is None:" guards, try:/return
# lines and the final info-dict braces are absent (numbering holes).
3052 class MTVIE(InfoExtractor):
3053 """Information extractor for MTV.com"""
3055 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3058 def report_extraction(self, video_id):
3059 """Report information extraction."""
3060 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3062 def _real_extract(self, url):
3063 mobj = re.match(self._VALID_URL, url)
3065 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# the scheme is optional in _VALID_URL; normalize to http:// for download
3067 if not mobj.group('proto'):
3068 url = 'http://' + url
3069 video_id = mobj.group('videoid')
3071 webpage = self._download_webpage(url, video_id)
3073 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3075 self._downloader.trouble(u'ERROR: unable to extract song name')
3077 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3078 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3080 self._downloader.trouble(u'ERROR: unable to extract performer')
3082 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3083 video_title = performer + ' - ' + song_name
3085 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3087 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3089 mtvn_uri = mobj.group(1)
3091 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3093 self._downloader.trouble(u'ERROR: unable to extract content id')
3095 content_id = mobj.group(1)
3097 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3098 self.report_extraction(video_id)
3099 request = compat_urllib_request.Request(videogen_url)
3101 metadataXml = compat_urllib_request.urlopen(request).read()
3102 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3103 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3106 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3107 renditions = mdoc.findall('.//rendition')
3109 # For now, always pick the highest quality.
3110 rendition = renditions[-1]
# format label: "<ext>-<width>x<height>_<bitrate>" from rendition attribs
3113 _,_,ext = rendition.attrib['type'].partition('/')
3114 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3115 video_url = rendition.find('./src').text
3117 self._downloader.trouble('Invalid rendition field.')
3123 'uploader': performer,
3124 'upload_date': None,
3125 'title': video_title,
# Extractor for v.youku.com. Fetches the getPlayList JSON, then derives the
# real segment file ids from an obfuscated 'fileid' + 'seed' pair via a
# deterministic pseudo-random shuffle of a fixed character alphabet, and
# emits one info dict per video segment (…_partNN).
# NOTE(review): gapped listing — the _gen_sid def line, several return
# statements, guards and dict braces are absent (numbering holes).
3133 class YoukuIE(InfoExtractor):
3134 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3136 def report_download_webpage(self, file_id):
3137 """Report webpage download."""
3138 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3140 def report_extraction(self, file_id):
3141 """Report information extraction."""
3142 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# (body of _gen_sid — its def line is missing from this listing)
# session id: millisecond timestamp concatenated with two random numbers
3145 nowTime = int(time.time() * 1000)
3146 random1 = random.randint(1000,1998)
3147 random2 = random.randint(1000,9999)
3149 return "%d%d%d" %(nowTime,random1,random2)
3151 def _get_file_ID_mix_string(self, seed):
# linear-congruential-style shuffle: seed picks one remaining alphabet
# character per round, producing a seed-dependent permutation
3153 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3155 for i in range(len(source)):
3156 seed = (seed * 211 + 30031 ) % 65536
3157 index = math.floor(seed / 65536 * len(source) )
3158 mixed.append(source[int(index)])
3159 source.remove(source[int(index)])
3160 #return ''.join(mixed)
3163 def _get_file_id(self, fileId, seed):
# decode: each '*'-separated number indexes into the shuffled alphabet
3164 mixed = self._get_file_ID_mix_string(seed)
3165 ids = fileId.split('*')
3169 realId.append(mixed[int(ch)])
3170 return ''.join(realId)
3172 def _real_extract(self, url):
3173 mobj = re.match(self._VALID_URL, url)
3175 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3177 video_id = mobj.group('ID')
3179 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3181 request = compat_urllib_request.Request(info_url, None, std_headers)
3183 self.report_download_webpage(video_id)
3184 jsondata = compat_urllib_request.urlopen(request).read()
3185 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3186 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3189 self.report_extraction(video_id)
3191 jsonstr = jsondata.decode('utf-8')
3192 config = json.loads(jsonstr)
3194 video_title = config['data'][0]['title']
3195 seed = config['data'][0]['seed']
# format selection against the formats Youku actually offers (streamfileids)
3197 format = self._downloader.params.get('format', None)
3198 supported_format = list(config['data'][0]['streamfileids'].keys())
3200 if format is None or format == 'best':
3201 if 'hd2' in supported_format:
3206 elif format == 'worst':
3214 fileid = config['data'][0]['streamfileids'][format]
3215 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3216 except (UnicodeDecodeError, ValueError, KeyError):
3217 self._downloader.trouble(u'ERROR: unable to extract info section')
3221 sid = self._gen_sid()
3222 fileid = self._get_file_id(fileid, seed)
3224 #column 8,9 of fileid represent the segment number
3225 #fileid[7:9] should be changed
3226 for index, key in enumerate(keys):
3228 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3229 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3232 'id': '%s_part%02d' % (video_id, index),
3233 'url': download_url,
3235 'upload_date': None,
3236 'title': video_title,
3239 files_info.append(info)
# Extractor for video.xnxx.com: downloads the page and pulls the flv URL,
# title and thumbnail out of it with the three class-level regexes.
# NOTE(review): gapped listing — "if result is None:" guards, try:/return
# lines and the info-dict braces are absent (numbering holes).
3244 class XNXXIE(InfoExtractor):
3245 """Information extractor for xnxx.com"""
3247 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3249 VIDEO_URL_RE = r'flv_url=(.*?)&'
3250 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3251 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3253 def report_webpage(self, video_id):
3254 """Report information extraction"""
3255 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3257 def report_extraction(self, video_id):
3258 """Report information extraction"""
3259 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3261 def _real_extract(self, url):
3262 mobj = re.match(self._VALID_URL, url)
3264 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3266 video_id = mobj.group(1)
3268 self.report_webpage(video_id)
3270 # Get webpage content
3272 webpage_bytes = compat_urllib_request.urlopen(url).read()
3273 webpage = webpage_bytes.decode('utf-8')
3274 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3275 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# flv URL is percent-encoded inside the page, hence the unquote
3278 result = re.search(self.VIDEO_URL_RE, webpage)
3280 self._downloader.trouble(u'ERROR: unable to extract video url')
3282 video_url = compat_urllib_parse.unquote(result.group(1))
3284 result = re.search(self.VIDEO_TITLE_RE, webpage)
3286 self._downloader.trouble(u'ERROR: unable to extract video title')
3288 video_title = result.group(1)
3290 result = re.search(self.VIDEO_THUMB_RE, webpage)
3292 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3294 video_thumbnail = result.group(1)
3300 'upload_date': None,
3301 'title': video_title,
3303 'thumbnail': video_thumbnail,
3304 'description': None,
# Extractor for plus.google.com posts. Two steps: (1) scrape the post page
# for date/uploader/title, (2) follow the photos page referenced by the
# post and collect all redirector.googlevideo.com links, keeping the
# highest-resolution one.
# NOTE(review): gapped listing — guards, try:/return lines and the final
# info-dict braces are absent (numbering holes).
3308 class GooglePlusIE(InfoExtractor):
3309 """Information extractor for plus.google.com."""
3311 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3312 IE_NAME = u'plus.google'
3314 def __init__(self, downloader=None):
3315 InfoExtractor.__init__(self, downloader)
3317 def report_extract_entry(self, url):
3318 """Report downloading extry"""
3319 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3321 def report_date(self, upload_date):
3322 """Report downloading extry"""
3323 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3325 def report_uploader(self, uploader):
3326 """Report downloading extry"""
3327 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3329 def report_title(self, video_title):
3330 """Report downloading extry"""
3331 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3333 def report_extract_vid_page(self, video_page):
3334 """Report information extraction."""
3335 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3337 def _real_extract(self, url):
3338 # Extract id from URL
3339 mobj = re.match(self._VALID_URL, url)
3341 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3344 post_url = mobj.group(0)
3345 video_id = mobj.group(1)
3347 video_extension = 'flv'
3349 # Step 1, Retrieve post webpage to extract further information
3350 self.report_extract_entry(post_url)
3351 request = compat_urllib_request.Request(post_url)
3353 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3354 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3355 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3358 # Extract update date
3360 pattern = 'title="Timestamp">(.*?)</a>'
3361 mobj = re.search(pattern, webpage)
3363 upload_date = mobj.group(1)
3364 # Convert timestring to a format suitable for filename
3365 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3366 upload_date = upload_date.strftime('%Y%m%d')
3367 self.report_date(upload_date)
# extract uploader name from the rel="author" anchor
3371 pattern = r'rel\="author".*?>(.*?)</a>'
3372 mobj = re.search(pattern, webpage)
3374 uploader = mobj.group(1)
3375 self.report_uploader(uploader)
3378 # Get the first line for title
3380 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3381 mobj = re.search(pattern, webpage)
3383 video_title = mobj.group(1)
3384 self.report_title(video_title)
3386 # Step 2, Stimulate clicking the image box to launch video
3387 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3388 mobj = re.search(pattern, webpage)
3390 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3392 video_page = mobj.group(1)
3393 request = compat_urllib_request.Request(video_page)
3395 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3396 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3397 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3399 self.report_extract_vid_page(video_page)
3402 # Extract video links on video page
3403 """Extract video links of all sizes"""
3404 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3405 mobj = re.findall(pattern, webpage)
3407 self._downloader.trouble(u'ERROR: unable to extract video links')
3409 # Sort in resolution
3410 links = sorted(mobj)
3412 # Choose the lowest of the sort, i.e. highest resolution
3413 video_url = links[-1]
3414 # Only get the url. The resolution part in the tuple has no use anymore
3415 video_url = video_url[-1]
3416 # Treat escaped \u0026 style hex
3418 video_url = video_url.decode("unicode_escape")
3419 except AttributeError: # Python 3
3420 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3426 'uploader': uploader,
3427 'upload_date': upload_date,
3428 'title': video_title,
3429 'ext': video_extension,
# Extractor for nba.com video pages: the CDN URL is built directly from the
# URL path; title/date/description are scraped from meta tags via the local
# _findProp helper (returns unescaped group 1, or a default on no match).
# NOTE(review): gapped listing — guards, else-branches and the info-dict
# braces are absent (numbering holes). 'uploader_date' on line 3463 looks
# like a typo for 'upload_date' — confirm against the downloader's keys.
3433 class NBAIE(InfoExtractor):
3436 def _real_extract(self, url):
3437 mobj = re.match(self._VALID_URL, url)
3439 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3442 video_id = mobj.group(1)
3443 if video_id.endswith('/index.html'):
3444 video_id = video_id[:-len('/index.html')]
3446 webpage = self._download_webpage(url, video_id)
3448 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3449 def _findProp(rexp, default=None):
3450 m = re.search(rexp, webpage)
3452 return unescapeHTML(m.group(1))
3456 shortened_video_id = video_id.rpartition('/')[2]
3457 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3459 'id': shortened_video_id,
3463 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3464 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv. _parse_page fetches one page of the
# JSON API and returns (item count, list of valid info dicts); _real_extract
# chooses the channel-archives or single-broadcast endpoint and pages
# through results _JUSTIN_PAGE_LIMIT at a time.
# NOTE(review): gapped listing — guards, try:/return lines and dict braces
# are absent (numbering holes).
3468 class JustinTVIE(InfoExtractor):
3469 """Information extractor for justin.tv and twitch.tv"""
3470 # TODO: One broadcast may be split into multiple videos. The key
3471 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3472 # starts at 1 and increases. Can we treat all parts as one video?
3474 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3475 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3476 _JUSTIN_PAGE_LIMIT = 100
3477 IE_NAME = u'justin.tv'
3479 def report_extraction(self, file_id):
3480 """Report information extraction."""
3481 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3483 def report_download_page(self, channel, offset):
3484 """Report attempt to download a single page of videos."""
3485 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3486 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3488 # Return count of items, list of *valid* items
3489 def _parse_page(self, url):
3491 urlh = compat_urllib_request.urlopen(url)
3492 webpage_bytes = urlh.read()
3493 webpage = webpage_bytes.decode('utf-8', 'ignore')
3494 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3495 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
# a non-list response is the API's error envelope
3498 response = json.loads(webpage)
3499 if type(response) != list:
3500 error_text = response.get('error', 'unknown error')
3501 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3504 for clip in response:
3505 video_url = clip['video_file_url']
3507 video_extension = os.path.splitext(video_url)[1][1:]
# start_time 'YYYY-MM-DD…' -> 'YYYYMMDD'
3508 video_date = re.sub('-', '', clip['start_time'][:10])
3509 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3510 video_id = clip['id']
3511 video_title = clip.get('title', video_id)
3515 'title': video_title,
3516 'uploader': clip.get('channel_name', video_uploader_id),
3517 'uploader_id': video_uploader_id,
3518 'upload_date': video_date,
3519 'ext': video_extension,
3521 return (len(response), info)
3523 def _real_extract(self, url):
3524 mobj = re.match(self._VALID_URL, url)
3526 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# lastindex == 1 means only the channel group matched (no /b/<id> part)
3529 api = 'http://api.justin.tv'
3530 video_id = mobj.group(mobj.lastindex)
3532 if mobj.lastindex == 1:
3534 api += '/channel/archives/%s.json'
3536 api += '/broadcast/by_archive/%s.json'
3537 api = api % (video_id,)
3539 self.report_extraction(video_id)
# page through the API; a short page (count != limit) means we are done
3543 limit = self._JUSTIN_PAGE_LIMIT
3546 self.report_download_page(video_id, offset)
3547 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3548 page_count, page_info = self._parse_page(page_url)
3549 info.extend(page_info)
3550 if not paged or page_count != limit:
# Extractor for funnyordie.com: video URL from the second <source> element,
# title from the player_page_h1 anchor, description from og:description.
# NOTE(review): gapped listing — guards and the info-dict braces are absent
# (numbering holes).
3555 class FunnyOrDieIE(InfoExtractor):
3556 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3558 def _real_extract(self, url):
3559 mobj = re.match(self._VALID_URL, url)
3561 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3564 video_id = mobj.group('id')
3565 webpage = self._download_webpage(url, video_id)
3567 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3569 self._downloader.trouble(u'ERROR: unable to find video information')
3570 video_url = unescapeHTML(m.group('url'))
3572 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3574 self._downloader.trouble(u'Cannot find video title')
3575 title = unescapeHTML(m.group('title'))
3577 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3579 desc = unescapeHTML(m.group('desc'))
3588 'description': desc,
# Extractor for tweetreel.com: scrapes status id, tweet text (description),
# uploader and unix timestamp from the page, then builds the .mov URL from
# the status id.
# NOTE(review): gapped listing — "if m is None:" guards and the info-dict
# braces are absent (numbering holes).
3592 class TweetReelIE(InfoExtractor):
3593 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3595 def _real_extract(self, url):
3596 mobj = re.match(self._VALID_URL, url)
3598 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3601 video_id = mobj.group('id')
3602 webpage = self._download_webpage(url, video_id)
3604 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3606 self._downloader.trouble(u'ERROR: Cannot find status ID')
3607 status_id = m.group(1)
3609 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3611 self._downloader.trouble(u'WARNING: Cannot find description')
# strip anchor tags from the tweet text before unescaping
3612 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3614 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3616 self._downloader.trouble(u'ERROR: Cannot find uploader')
3617 uploader = unescapeHTML(m.group('uploader'))
3618 uploader_id = unescapeHTML(m.group('uploader_id'))
3620 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3622 self._downloader.trouble(u'ERROR: Cannot find upload date')
# unix timestamp -> 'YYYYMMDD' filename-friendly date
3623 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3626 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3633 'description': desc,
3634 'uploader': uploader,
3635 'uploader_id': uploader_id,
3636 'internal_id': status_id,
3637 'upload_date': upload_date
# Extractor for store.steampowered.com: loads the game's /video/ page and
# pairs each 'movie_<id>' player entry with its <span class="title"> via
# zip, yielding one info dict per trailer. suitable() is overridden because
# _VALID_URL needs re.VERBOSE.
# NOTE(review): gapped listing — the gameID group of _VALID_URL, guards and
# dict/list framing are absent (numbering holes).
3641 class SteamIE(InfoExtractor):
3642 _VALID_URL = r"""http://store.steampowered.com/
3643 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3645 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3648 def suitable(self, url):
3649 """Receives a URL and returns True if suitable for this IE."""
3650 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3652 def _real_extract(self, url):
3653 m = re.match(self._VALID_URL, url, re.VERBOSE)
3654 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3655 gameID = m.group('gameID')
3656 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3657 webpage = self._download_webpage(videourl, gameID)
3658 mweb = re.finditer(urlRE, webpage)
3659 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3660 titles = re.finditer(namesRE, webpage)
3662 for vid,vtitle in zip(mweb,titles):
3663 video_id = vid.group('videoID')
3664 title = vtitle.group('videoName')
3665 video_url = vid.group('videoURL')
3667 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3672 'title': unescapeHTML(title)
# Extractor for ustream.tv recorded videos: the video URL is derived from
# the numeric id; title and uploader id are scraped from data- attributes.
# NOTE(review): gapped listing — the returned info-dict braces and several
# fields are absent (numbering holes). No None-checks are visible on the
# re.search results here.
3677 class UstreamIE(InfoExtractor):
3678 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3679 IE_NAME = u'ustream'
3681 def _real_extract(self, url):
3682 m = re.match(self._VALID_URL, url)
3683 video_id = m.group('videoID')
3684 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3685 webpage = self._download_webpage(url, video_id)
3686 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3687 title = m.group('title')
3688 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3689 uploader = m.group('uploader')
3695 'uploader': uploader
# Extractor for rbmaradio.com shows: pulls the gon.show JSON blob embedded
# in a <script> tag, appends a 256kbps cbr parameter to the akamai URL and
# maps the JSON fields onto the info dict (with .get() for optional ones).
# NOTE(review): gapped listing — the info-dict braces and some fields are
# absent (numbering holes).
3699 class RBMARadioIE(InfoExtractor):
3700 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3702 def _real_extract(self, url):
3703 m = re.match(self._VALID_URL, url)
3704 video_id = m.group('videoID')
3706 webpage = self._download_webpage(url, video_id)
3707 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3709 raise ExtractorError(u'Cannot find metadata')
3710 json_data = m.group(1)
3713 data = json.loads(json_data)
3714 except ValueError as e:
3715 raise ExtractorError(u'Invalid JSON: ' + str(e))
3717 video_url = data['akamai_url'] + '&cbr=256'
# extension taken from the URL path, after the last '.'
3718 url_parts = compat_urllib_parse_urlparse(video_url)
3719 video_ext = url_parts.path.rpartition('.')[2]
3724 'title': data['title'],
3725 'description': data.get('teaser_text'),
3726 'location': data.get('country_of_origin'),
3727 'uploader': data.get('host', {}).get('name'),
3728 'uploader_id': data.get('host', {}).get('slug'),
3729 'thumbnail': data.get('image', {}).get('large_url_2x'),
3730 'duration': data.get('duration'),
# Extractor for youporn.com: sets an age_verified cookie, scrapes title /
# date / uploader, collects every download link from the downloadList,
# derives each link's size+bitrate "format" from its URL path, then honors
# --list-formats / best / worst / all / a specific requested format.
# NOTE(review): gapped listing — guards, return statements, the per-format
# dict braces and the sorting of 'formats' are absent (numbering holes).
3736 class YouPornIE(InfoExtractor):
3736 """Information extractor for youporn.com."""
3737 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3739 def _print_formats(self, formats):
3740 """Print all available formats"""
3741 print(u'Available formats:')
3742 print(u'ext\t\tformat')
3743 print(u'---------------------------------')
3744 for format in formats:
3745 print(u'%s\t\t%s' % (format['ext'], format['format']))
3747 def _specific(self, req_format, formats):
# linear scan for the entry whose 'format' equals the request
3749 if(x["format"]==req_format):
3753 def _real_extract(self, url):
3754 mobj = re.match(self._VALID_URL, url)
3756 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3759 video_id = mobj.group('videoid')
# pretend age verification already happened
3761 req = compat_urllib_request.Request(url)
3762 req.add_header('Cookie', 'age_verified=1')
3763 webpage = self._download_webpage(req, video_id)
3765 # Get the video title
3766 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3768 raise ExtractorError(u'Unable to extract video title')
3769 video_title = result.group('title').strip()
3771 # Get the video date
3772 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3774 self._downloader.to_stderr(u'WARNING: unable to extract video date')
3777 upload_date = result.group('date').strip()
3779 # Get the video uploader
3780 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3782 self._downloader.to_stderr(u'WARNING: unable to extract uploader')
3783 video_uploader = None
3785 video_uploader = result.group('uploader').strip()
3786 video_uploader = clean_html( video_uploader )
3788 # Get all of the formats available
3789 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3790 result = re.search(DOWNLOAD_LIST_RE, webpage)
3792 raise ExtractorError(u'Unable to extract download list')
3793 download_list_html = result.group('download_list').strip()
3795 # Get all of the links from the page
3796 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3797 links = re.findall(LINK_RE, download_list_html)
3798 if(len(links) == 0):
3799 raise ExtractorError(u'ERROR: no known formats available for video')
3801 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3806 # A link looks like this:
3807 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3808 # A path looks like this:
3809 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3810 video_url = unescapeHTML( link )
3811 path = compat_urllib_parse_urlparse( video_url ).path
3812 extension = os.path.splitext( path )[1][1:]
# e.g. ['480p', '370k'] from the '480p_370k_<id>' path component
3813 format = path.split('/')[4].split('_')[:2]
3816 format = "-".join( format )
3817 title = u'%s-%s-%s' % (video_title, size, bitrate)
3822 'uploader': video_uploader,
3823 'upload_date': upload_date,
3828 'description': None,
3832 if self._downloader.params.get('listformats', None):
3833 self._print_formats(formats)
3836 req_format = self._downloader.params.get('format', None)
3837 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3839 if req_format is None or req_format == 'best':
3841 elif req_format == 'worst':
3842 return [formats[-1]]
3843 elif req_format in ('-1', 'all'):
3846 format = self._specific( req_format, formats )
3848 self._downloader.trouble(u'ERROR: requested format not available')
# Extractor for pornotube.com: title comes from the URL itself; the flv URL
# and the "Added <date> by" string are scraped from the page.
# NOTE(review): gapped listing — "if result is None:" guards and parts of
# the info dict are absent (numbering holes). The error message on line
# 3882 says 'title' while the regex extracts the upload date.
3854 class PornotubeIE(InfoExtractor):
3855 """Information extractor for pornotube.com."""
3856 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3858 def _real_extract(self, url):
3859 mobj = re.match(self._VALID_URL, url)
3861 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3864 video_id = mobj.group('videoid')
3865 video_title = mobj.group('title')
3867 # Get webpage content
3868 webpage = self._download_webpage(url, video_id)
3871 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3872 result = re.search(VIDEO_URL_RE, webpage)
3874 self._downloader.trouble(u'ERROR: unable to extract video url')
3876 video_url = compat_urllib_parse.unquote(result.group('url'))
3878 #Get the uploaded date
3879 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3880 result = re.search(VIDEO_UPLOADED_RE, webpage)
3882 self._downloader.trouble(u'ERROR: unable to extract video title')
3884 upload_date = result.group('date')
3886 info = {'id': video_id,
3889 'upload_date': upload_date,
3890 'title': video_title,
# Extractor for youjizz.com: reads the <title> from the watch page, follows
# the embed page referenced in it, and pulls the real flv URL out of the
# embed page's so.addVariable("file", ...) call.
# NOTE(review): gapped listing — "if result is None:" guards and parts of
# the info dict are absent (numbering holes).
3896 class YouJizzIE(InfoExtractor):
3897 """Information extractor for youjizz.com."""
3898 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3900 def _real_extract(self, url):
3901 mobj = re.match(self._VALID_URL, url)
3903 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3906 video_id = mobj.group('videoid')
3908 # Get webpage content
3909 webpage = self._download_webpage(url, video_id)
3911 # Get the video title
3912 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3914 raise ExtractorError(u'ERROR: unable to extract video title')
3915 video_title = result.group('title').strip()
3917 # Get the embed page
3918 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3920 raise ExtractorError(u'ERROR: unable to extract embed page')
# note: video_id is re-assigned to the embed page's numeric id here
3922 embed_page_url = result.group(0).strip()
3923 video_id = result.group('videoid')
3925 webpage = self._download_webpage(embed_page_url, video_id)
3928 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3930 raise ExtractorError(u'ERROR: unable to extract video url')
3931 video_url = result.group('source')
3933 info = {'id': video_id,
3935 'title': video_title,
3938 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the PAGE.mix JSON from the page,
# then walks the play/next API with a random session id, emitting one track
# per iteration until the API reports at_last_track.
# NOTE(review): gapped listing — the mix_id assignment, track-dict braces
# and the loop's result collection are absent (numbering holes).
3942 class EightTracksIE(InfoExtractor):
3944 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3946 def _real_extract(self, url):
3947 mobj = re.match(self._VALID_URL, url)
3949 raise ExtractorError(u'Invalid URL: %s' % url)
3950 playlist_id = mobj.group('id')
3952 webpage = self._download_webpage(url, playlist_id)
3954 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3956 raise ExtractorError(u'Cannot find trax information')
3957 json_like = m.group(1)
3958 data = json.loads(json_like)
# random session id for the play API (stringified for URL building)
3960 session = str(random.randint(0, 1000000000))
3962 track_count = data['tracks_count']
3963 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3964 next_url = first_url
3966 for i in itertools.count():
3967 api_json = self._download_webpage(next_url, playlist_id,
3968 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3969 errnote=u'Failed to download song information')
3970 api_data = json.loads(api_json)
3971 track_data = api_data[u'set']['track']
3973 'id': track_data['id'],
3974 'url': track_data['track_file_stream_url'],
3975 'title': track_data['performer'] + u' - ' + track_data['name'],
3976 'raw_title': track_data['name'],
3977 'uploader_id': data['user']['login'],
3981 if api_data['set']['at_last_track']:
3983 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: video and thumbnail URLs are built directly from
# the id; title and uploader are scraped from og:title and the bio block.
# NOTE(review): gapped listing — the info-dict braces and some fields are
# absent (numbering holes). No None-checks are visible on the re.search
# results here.
3986 class KeekIE(InfoExtractor):
3987 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3990 def _real_extract(self, url):
3991 m = re.match(self._VALID_URL, url)
3992 video_id = m.group('videoID')
3993 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3994 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3995 webpage = self._download_webpage(url, video_id)
3996 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
3997 title = unescapeHTML(m.group('title'))
3998 m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
3999 uploader = unescapeHTML(m.group('uploader'))
4005 'thumbnail': thumbnail,
4006 'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # The verbose regex distinguishes a playlist URL from a single-talk URL
    # via the mutually exclusive named groups type_playlist / type_talk.
    _VALID_URL = r'''http://www\.ted\.com/
                     (
                          ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                          |
                          ((?P<type_talk>talks)) # We have a simple talk
                     )
                     /(?P<name>\w+) # Here goes the name and then ".html"
                     '''

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag, which the
        # base-class implementation does not pass.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME, playlist_id, name))
            return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each talk <li> carries its numeric id and media slug as data-attrs;
        # titles are scraped separately and zipped with the talk matches.
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="/talks/(.+).html">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)
        info = []
        for m_video, m_name in zip(m_videos, m_names):
            video_dic = {
                'id': m_video.group('video_id'),
                'url': self._talk_video_link(m_video.group('mediaSlug')),
                'ext': 'mp4',
                'title': m_name.group('fullname'),
            }
            info.append(video_dic)
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        videoName = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE = r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>'
        title = re.search(title_RE, webpage).group('title')
        # The talkDetails inline script embeds the numeric id and media slug
        # needed to build the direct download URL.
        info_RE = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
        }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos (via its XML metadata API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Fetch the metadata XML for the video id embedded in the URL path.

        Returns a single-element list of info dictionaries; reports an
        error via the downloader and returns None when mandatory fields
        (download URL, title) are missing from the metadata.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        # Renamed from `format` to avoid shadowing the builtin; the info-dict
        # key stays 'format'. Per the InfoExtractor contract, 'format'
        # defaults to the extension when the metadata omits it.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text
        # description and thumbnail are optional fields
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description,
        }
        return [info]
4138 def gen_extractors():
4139 """ Return a list of an instance of every supported extractor.
4140 The order does matter; the first extractor matched is the one handling the URL.
4143 YoutubePlaylistIE(),
4167 StanfordOpenClassroomIE(),