2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields
    (NOTE(review): 'id'/'url' entries appear to be elided from this copy
    of the docstring -- confirm against the full file):

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a FileDownloader)."""
        # Stores the downloader for use by extraction/reporting helpers.
        self.set_downloader(downloader)
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Delegates straight to the subclass hook; presumably initialize()
        # is invoked before this elsewhere -- TODO confirm against full file.
        return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # Kept on the instance for later use by _download_webpage and the
        # report_* helpers.
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally a no-op (docstring-only body) in the base class.
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
112 return type(self).__name__[:-2]
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
117 note = u'Downloading video webpage'
118 if note is not False:
119 self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
124 errnote = u'Unable to download webpage'
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
127 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns the data of the page as a string """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
142 self._downloader.to_screen(u'Dumping request to ' + url)
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
145 return webpage_bytes.decode(encoding, 'replace')
147 #Methods for following #608
148 #They set the correct value of the '_type' key
149 def video_result(self, video_info):
150 """Returns a video"""
151 video_info['_type'] = 'video'
153 def url_result(self, url, ie=None):
154 """Returns a url that points to a page that should be processed"""
155 #TODO: ie should be the class used for getting the info
156 video_info = {'_type': 'url',
159 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
160 """Returns a playlist"""
161 video_info = {'_type': 'playlist',
164 video_info['id'] = playlist_id
166 video_info['title'] = playlist_title
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""
175 (?:https?://)? # http(s):// (optional)
176 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
177 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
178 (?:.*?\#/)? # handle anchor (#/) redirect urls
179 (?: # the various things that can precede the ID:
180 (?:(?:v|embed|e)/) # v/ or embed/ or e/
181 |(?: # or the v= param in all its forms
182 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
183 (?:\?|\#!?) # the params delimiter ? or # or #!
184 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
187 )? # optional -> youtube.com/xxxx is OK
188 )? # all until now is optional -> you can pass the naked ID
189 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
190 (?(1).+)? # if we found the ID, everything can follow
    # Forces an English/US interface so scraped strings are predictable.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the target of a next_url= redirect parameter.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same itags, but free (WebM) formats promoted ahead of equivalents.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
200 _video_extensions = {
206 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
212 _video_dimensions = {
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match the video regexp, so defer those to the
        # playlist extractor. (Takes `cls` -- presumably a classmethod;
        # decorator not visible in this copy.)
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
236 def report_lang(self):
237 """Report attempt to set language."""
238 self._downloader.to_screen(u'[youtube] Setting language')
240 def report_login(self):
241 """Report attempt to log in."""
242 self._downloader.to_screen(u'[youtube] Logging in')
244 def report_age_confirmation(self):
245 """Report attempt to confirm age."""
246 self._downloader.to_screen(u'[youtube] Confirming age')
248 def report_video_webpage_download(self, video_id):
249 """Report attempt to download video webpage."""
250 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
252 def report_video_info_webpage_download(self, video_id):
253 """Report attempt to download video info webpage."""
254 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
256 def report_video_subtitles_download(self, video_id):
257 """Report attempt to download video info webpage."""
258 self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
260 def report_video_subtitles_request(self, video_id, sub_lang, format):
261 """Report attempt to download video info webpage."""
262 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
264 def report_video_subtitles_available(self, video_id, sub_lang_list):
265 """Report available subtitles."""
266 sub_lang = ",".join(list(sub_lang_list.keys()))
267 self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
269 def report_information_extraction(self, video_id):
270 """Report attempt to extract video information."""
271 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
273 def report_unavailable_format(self, video_id, format):
274 """Report extracted video URL."""
275 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
277 def report_rtmp_download(self):
278 """Indicate the download will use the RTMP protocol."""
279 self._downloader.to_screen(u'[youtube] RTMP download detected')
281 def _get_available_subtitles(self, video_id):
282 self.report_video_subtitles_download(video_id)
283 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
285 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
286 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
287 return (u'unable to download video subtitles: %s' % compat_str(err), None)
288 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
289 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
290 if not sub_lang_list:
291 return (u'video doesn\'t have subtitles', None)
294 def _list_available_subtitles(self, video_id):
295 sub_lang_list = self._get_available_subtitles(video_id)
296 self.report_video_subtitles_available(video_id, sub_lang_list)
298 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
301 (error_message, sub_lang, sub)
303 self.report_video_subtitles_request(video_id, sub_lang, format)
304 params = compat_urllib_parse.urlencode({
310 url = 'http://www.youtube.com/api/timedtext?' + params
312 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
313 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
314 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
316 return (u'Did not fetch video subtitles', None, None)
317 return (None, sub_lang, sub)
319 def _extract_subtitle(self, video_id):
321 Return a list with a tuple:
322 [(error_message, sub_lang, sub)]
324 sub_lang_list = self._get_available_subtitles(video_id)
325 sub_format = self._downloader.params.get('subtitlesformat')
326 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
327 return [(sub_lang_list[0], None, None)]
328 if self._downloader.params.get('subtitleslang', False):
329 sub_lang = self._downloader.params.get('subtitleslang')
330 elif 'en' in sub_lang_list:
333 sub_lang = list(sub_lang_list.keys())[0]
334 if not sub_lang in sub_lang_list:
335 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
337 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
340 def _extract_all_subtitles(self, video_id):
341 sub_lang_list = self._get_available_subtitles(video_id)
342 sub_format = self._downloader.params.get('subtitlesformat')
343 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
344 return [(sub_lang_list[0], None, None)]
346 for sub_lang in sub_lang_list:
347 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
348 subtitles.append(subtitle)
351 def _print_formats(self, formats):
352 print('Available formats:')
354 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
356 def _real_initialize(self):
357 if self._downloader is None:
362 downloader_params = self._downloader.params
364 # Attempt to use provided username and password or .netrc data
365 if downloader_params.get('username', None) is not None:
366 username = downloader_params['username']
367 password = downloader_params['password']
368 elif downloader_params.get('usenetrc', False):
370 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
375 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
376 except (IOError, netrc.NetrcParseError) as err:
377 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
381 request = compat_urllib_request.Request(self._LANG_URL)
384 compat_urllib_request.urlopen(request).read()
385 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
386 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
389 # No authentication to be performed
393 request = compat_urllib_request.Request(self._LOGIN_URL)
395 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
396 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
397 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
402 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
404 galx = match.group(1)
406 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
412 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
416 u'PersistentCookie': u'yes',
418 u'bgresponse': u'js_disabled',
419 u'checkConnection': u'',
420 u'checkedDomains': u'youtube',
426 u'signIn': u'Sign in',
428 u'service': u'youtube',
432 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
434 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
435 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
436 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
439 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
440 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
441 self._downloader.report_warning(u'unable to log in: bad username or password')
443 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
444 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
450 'action_confirm': 'Confirm',
452 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
454 self.report_age_confirmation()
455 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
456 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
457 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
460 def _extract_id(self, url):
461 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
463 self._downloader.report_error(u'invalid URL: %s' % url)
465 video_id = mobj.group(2)
468 def _real_extract(self, url):
469 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
470 mobj = re.search(self._NEXT_URL_RE, url)
472 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
473 video_id = self._extract_id(url)
476 self.report_video_webpage_download(video_id)
477 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
478 request = compat_urllib_request.Request(url)
480 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
481 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
482 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
485 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
487 # Attempt to extract SWF player URL
488 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
490 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
495 self.report_video_info_webpage_download(video_id)
496 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
497 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
498 % (video_id, el_type))
499 video_info_webpage = self._download_webpage(video_info_url, video_id,
501 errnote='unable to download video info webpage')
502 video_info = compat_parse_qs(video_info_webpage)
503 if 'token' in video_info:
505 if 'token' not in video_info:
506 if 'reason' in video_info:
507 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
509 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
512 # Check for "rental" videos
513 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
514 self._downloader.report_error(u'"rental" videos not supported')
517 # Start extracting information
518 self.report_information_extraction(video_id)
521 if 'author' not in video_info:
522 self._downloader.report_error(u'unable to extract uploader name')
524 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
527 video_uploader_id = None
528 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
530 video_uploader_id = mobj.group(1)
532 self._downloader.report_warning(u'unable to extract uploader nickname')
535 if 'title' not in video_info:
536 self._downloader.report_error(u'unable to extract video title')
538 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
541 if 'thumbnail_url' not in video_info:
542 self._downloader.report_warning(u'unable to extract video thumbnail')
544 else: # don't panic if we can't find it
545 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
549 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
551 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
552 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
553 for expression in format_expressions:
555 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
560 video_description = get_element_by_id("eow-description", video_webpage)
561 if video_description:
562 video_description = clean_html(video_description)
564 video_description = ''
567 video_subtitles = None
569 if self._downloader.params.get('writesubtitles', False):
570 video_subtitles = self._extract_subtitle(video_id)
572 (sub_error, sub_lang, sub) = video_subtitles[0]
574 self._downloader.report_error(sub_error)
576 if self._downloader.params.get('allsubtitles', False):
577 video_subtitles = self._extract_all_subtitles(video_id)
578 for video_subtitle in video_subtitles:
579 (sub_error, sub_lang, sub) = video_subtitle
581 self._downloader.report_error(sub_error)
583 if self._downloader.params.get('listsubtitles', False):
584 sub_lang_list = self._list_available_subtitles(video_id)
587 if 'length_seconds' not in video_info:
588 self._downloader.report_warning(u'unable to extract video duration')
591 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
594 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
596 # Decide which formats to download
597 req_format = self._downloader.params.get('format', None)
599 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
600 self.report_rtmp_download()
601 video_url_list = [(None, video_info['conn'][0])]
602 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
603 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
604 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
605 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
606 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
608 format_limit = self._downloader.params.get('format_limit', None)
609 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
610 if format_limit is not None and format_limit in available_formats:
611 format_list = available_formats[available_formats.index(format_limit):]
613 format_list = available_formats
614 existing_formats = [x for x in format_list if x in url_map]
615 if len(existing_formats) == 0:
616 self._downloader.report_error(u'no known formats available for video')
618 if self._downloader.params.get('listformats', None):
619 self._print_formats(existing_formats)
621 if req_format is None or req_format == 'best':
622 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
623 elif req_format == 'worst':
624 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
625 elif req_format in ('-1', 'all'):
626 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
628 # Specific formats. We pick the first in a slash-delimeted sequence.
629 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
630 req_formats = req_format.split('/')
631 video_url_list = None
632 for rf in req_formats:
634 video_url_list = [(rf, url_map[rf])]
636 if video_url_list is None:
637 self._downloader.report_error(u'requested format not available')
640 self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
644 for format_param, video_real_url in video_url_list:
646 video_extension = self._video_extensions.get(format_param, 'flv')
648 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
649 self._video_dimensions.get(format_param, '???'))
653 'url': video_real_url,
654 'uploader': video_uploader,
655 'uploader_id': video_uploader_id,
656 'upload_date': upload_date,
657 'title': video_title,
658 'ext': video_extension,
659 'format': video_format,
660 'thumbnail': video_thumbnail,
661 'description': video_description,
662 'player_url': player_url,
663 'subtitles': video_subtitles,
664 'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Captures (video_id, display_title) from a /watch/ URL.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint POSTed to in order to disable the family filter.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
677 def __init__(self, downloader=None):
678 InfoExtractor.__init__(self, downloader)
680 def report_disclaimer(self):
681 """Report disclaimer retrieval."""
682 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
684 def report_age_confirmation(self):
685 """Report attempt to confirm age."""
686 self._downloader.to_screen(u'[metacafe] Confirming age')
688 def report_download_webpage(self, video_id):
689 """Report webpage download."""
690 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
692 def report_extraction(self, video_id):
693 """Report information extraction."""
694 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
696 def _real_initialize(self):
697 # Retrieve disclaimer
698 request = compat_urllib_request.Request(self._DISCLAIMER)
700 self.report_disclaimer()
701 disclaimer = compat_urllib_request.urlopen(request).read()
702 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
703 self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
709 'submit': "Continue - I'm over 18",
711 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
713 self.report_age_confirmation()
714 disclaimer = compat_urllib_request.urlopen(request).read()
715 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
716 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
719 def _real_extract(self, url):
720 # Extract id and simplified title from URL
721 mobj = re.match(self._VALID_URL, url)
723 self._downloader.report_error(u'invalid URL: %s' % url)
726 video_id = mobj.group(1)
728 # Check if video comes from YouTube
729 mobj2 = re.match(r'^yt-(.*)$', video_id)
730 if mobj2 is not None:
731 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))]
733 # Retrieve video webpage to extract further information
734 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
736 self.report_download_webpage(video_id)
737 webpage = compat_urllib_request.urlopen(request).read()
738 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
739 self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
742 # Extract URL, uploader and title from webpage
743 self.report_extraction(video_id)
744 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
746 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
747 video_extension = mediaURL[-3:]
749 # Extract gdaKey if available
750 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
754 gdaKey = mobj.group(1)
755 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
757 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
759 self._downloader.report_error(u'unable to extract media URL')
761 vardict = compat_parse_qs(mobj.group(1))
762 if 'mediaData' not in vardict:
763 self._downloader.report_error(u'unable to extract media URL')
765 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
767 self._downloader.report_error(u'unable to extract media URL')
769 mediaURL = mobj.group(1).replace('\\/', '/')
770 video_extension = mediaURL[-3:]
771 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
773 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
775 self._downloader.report_error(u'unable to extract title')
777 video_title = mobj.group(1).decode('utf-8')
779 mobj = re.search(r'submitter=(.*?);', webpage)
781 self._downloader.report_error(u'unable to extract uploader nickname')
783 video_uploader = mobj.group(1)
786 'id': video_id.decode('utf-8'),
787 'url': video_url.decode('utf-8'),
788 'uploader': video_uploader.decode('utf-8'),
790 'title': video_title,
791 'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Case-insensitive; captures the id+slug segment after /video/.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
802 def __init__(self, downloader=None):
803 InfoExtractor.__init__(self, downloader)
805 def report_extraction(self, video_id):
806 """Report information extraction."""
807 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
809 def _real_extract(self, url):
810 # Extract id and simplified title from URL
811 mobj = re.match(self._VALID_URL, url)
813 self._downloader.report_error(u'invalid URL: %s' % url)
816 video_id = mobj.group(1).split('_')[0].split('?')[0]
818 video_extension = 'mp4'
820 # Retrieve video webpage to extract further information
821 request = compat_urllib_request.Request(url)
822 request.add_header('Cookie', 'family_filter=off')
823 webpage = self._download_webpage(request, video_id)
825 # Extract URL, uploader and title from webpage
826 self.report_extraction(video_id)
827 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
829 self._downloader.report_error(u'unable to extract media URL')
831 flashvars = compat_urllib_parse.unquote(mobj.group(1))
833 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
836 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
839 self._downloader.report_error(u'unable to extract video URL')
842 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
844 self._downloader.report_error(u'unable to extract video URL')
847 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
849 # TODO: support choosing qualities
851 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
853 self._downloader.report_error(u'unable to extract title')
855 video_title = unescapeHTML(mobj.group('title'))
857 video_uploader = None
858 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
860 # lookin for official user
861 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
862 if mobj_official is None:
863 self._downloader.report_warning(u'unable to extract uploader nickname')
865 video_uploader = mobj_official.group(1)
867 video_uploader = mobj.group(1)
869 video_upload_date = None
870 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
872 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
877 'uploader': video_uploader,
878 'upload_date': video_upload_date,
879 'title': video_title,
880 'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Matches only URLs whose 'current' query parameter names a .flv file.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
890 def __init__(self, downloader=None):
891 InfoExtractor.__init__(self, downloader)
893 def report_download_webpage(self, video_id):
894 """Report webpage download."""
895 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
897 def report_extraction(self, video_id):
898 """Report information extraction."""
899 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
901 def _real_extract(self, url):
902 # Extract id from URL
903 mobj = re.match(self._VALID_URL, url)
905 self._downloader.report_error(u'Invalid URL: %s' % url)
908 video_id = mobj.group(1)
910 video_extension = 'flv'
912 # Retrieve video webpage to extract further information
913 request = compat_urllib_request.Request(url)
915 self.report_download_webpage(video_id)
916 webpage = compat_urllib_request.urlopen(request).read()
917 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
918 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
921 # Extract URL, uploader, and title from webpage
922 self.report_extraction(video_id)
923 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
925 self._downloader.report_error(u'unable to extract media URL')
927 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
931 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
933 self._downloader.report_error(u'unable to extract title')
935 video_title = mobj.group(1).decode('utf-8')
937 video_uploader = mobj.group(2).decode('utf-8')
940 'id': video_id.decode('utf-8'),
941 'url': video_url.decode('utf-8'),
942 'uploader': video_uploader,
944 'title': video_title,
945 'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
959 def __init__(self, downloader=None):
960 InfoExtractor.__init__(self, downloader)
962 def report_download_webpage(self, video_id):
963 """Report webpage download."""
964 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
966 def report_extraction(self, video_id):
967 """Report information extraction."""
968 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
970 def _real_extract(self, url, new_video=True):
971 # Extract ID from URL
972 mobj = re.match(self._VALID_URL, url)
974 self._downloader.report_error(u'Invalid URL: %s' % url)
977 video_id = mobj.group(2)
978 video_extension = 'flv'
980 # Rewrite valid but non-extractable URLs as
981 # extractable English language /watch/ URLs
982 if re.match(self._VPAGE_URL, url) is None:
983 request = compat_urllib_request.Request(url)
985 webpage = compat_urllib_request.urlopen(request).read()
986 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
987 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
990 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
992 self._downloader.report_error(u'Unable to extract id field')
994 yahoo_id = mobj.group(1)
996 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
998 self._downloader.report_error(u'Unable to extract vid field')
1000 yahoo_vid = mobj.group(1)
1002 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1003 return self._real_extract(url, new_video=False)
1005 # Retrieve video webpage to extract further information
1006 request = compat_urllib_request.Request(url)
1008 self.report_download_webpage(video_id)
1009 webpage = compat_urllib_request.urlopen(request).read()
1010 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1011 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1014 # Extract uploader and title from webpage
1015 self.report_extraction(video_id)
1016 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1018 self._downloader.report_error(u'unable to extract video title')
1020 video_title = mobj.group(1).decode('utf-8')
1022 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1024 self._downloader.report_error(u'unable to extract video uploader')
1026 video_uploader = mobj.group(1).decode('utf-8')
1028 # Extract video thumbnail
1029 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1031 self._downloader.report_error(u'unable to extract video thumbnail')
1033 video_thumbnail = mobj.group(1).decode('utf-8')
1035 # Extract video description
1036 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1038 self._downloader.report_error(u'unable to extract video description')
1040 video_description = mobj.group(1).decode('utf-8')
1041 if not video_description:
1042 video_description = 'No description available.'
1044 # Extract video height and width
1045 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1047 self._downloader.report_error(u'unable to extract video height')
1049 yv_video_height = mobj.group(1)
1051 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1053 self._downloader.report_error(u'unable to extract video width')
1055 yv_video_width = mobj.group(1)
1057 # Retrieve video playlist to extract media URL
1058 # I'm not completely sure what all these options are, but we
1059 # seem to need most of them, otherwise the server sends a 401.
1060 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1061 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1062 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1063 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1064 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1066 self.report_download_webpage(video_id)
1067 webpage = compat_urllib_request.urlopen(request).read()
1068 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1069 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1072 # Extract media URL from playlist XML
1073 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1075 self._downloader.report_error(u'Unable to extract media URL')
1077 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1078 video_url = unescapeHTML(video_url)
1081 'id': video_id.decode('utf-8'),
1083 'uploader': video_uploader,
1084 'upload_date': None,
1085 'title': video_title,
1086 'ext': video_extension.decode('utf-8'),
1087 'thumbnail': video_thumbnail.decode('utf-8'),
1088 'description': video_description,
1092 class VimeoIE(InfoExtractor):
# NOTE(review): numbered listing with interior lines elided (guard clauses,
# `try:` headers, `break`s and `return`s are not all visible) — confirm
# control flow against the full file.
1093 """Information extractor for vimeo.com."""
1095 # _VALID_URL matches Vimeo URLs
1096 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1099 def __init__(self, downloader=None):
1100 InfoExtractor.__init__(self, downloader)
1102 def report_download_webpage(self, video_id):
1103 """Report webpage download."""
1104 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1106 def report_extraction(self, video_id):
1107 """Report information extraction."""
1108 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1110 def _real_extract(self, url, new_video=True):
1111 # Extract ID from URL
1112 mobj = re.match(self._VALID_URL, url)
1114 self._downloader.report_error(u'Invalid URL: %s' % url)
1117 video_id = mobj.group('id')
# Normalize the URL: force https, and rewrite play_redirect_hls direct
# links to the canonical watch page.
1118 if not mobj.group('proto'):
1119 url = 'https://' + url
1120 if mobj.group('direct_link'):
1121 url = 'https://vimeo.com/' + video_id
1123 # Retrieve video webpage to extract further information
1124 request = compat_urllib_request.Request(url, None, std_headers)
1126 self.report_download_webpage(video_id)
1127 webpage_bytes = compat_urllib_request.urlopen(request).read()
1128 webpage = webpage_bytes.decode('utf-8')
1129 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1130 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1133 # Now we begin extracting as much information as we can from what we
1134 # retrieved. First we extract the information common to all extractors,
1135 # and latter we extract those that are Vimeo specific.
1136 self.report_extraction(video_id)
1138 # Extract the config JSON
# Fragile: relies on the literal ' = {config:' / ',assets:' markers in the
# page source rather than a real JSON parse of the whole script tag.
1140 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1141 config = json.loads(config)
1143 self._downloader.report_error(u'unable to extract info section')
1147 video_title = config["video"]["title"]
1149 # Extract uploader and uploader_id
# uploader_id is the last path segment of the owner's profile URL.
1150 video_uploader = config["video"]["owner"]["name"]
1151 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1153 # Extract video thumbnail
1154 video_thumbnail = config["video"]["thumbnail"]
1156 # Extract video description
1157 video_description = get_element_by_attribute("itemprop", "description", webpage)
1158 if video_description: video_description = clean_html(video_description)
1159 else: video_description = u''
1161 # Extract upload date
# Converted from ISO "YYYY-MM-DDT..." to the YYYYMMDD form the
# InfoExtractor contract expects for upload_date.
1162 video_upload_date = None
1163 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1164 if mobj is not None:
1165 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1167 # Vimeo specific: extract request signature and timestamp
1168 sig = config['request']['signature']
1169 timestamp = config['request']['timestamp']
1171 # Vimeo specific: extract video codec and quality information
1172 # First consider quality, then codecs, then take everything
1173 # TODO bind to format param
1174 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1175 files = { 'hd': [], 'sd': [], 'other': []}
1176 for codec_name, codec_extension in codecs:
1177 if codec_name in config["video"]["files"]:
1178 if 'hd' in config["video"]["files"][codec_name]:
1179 files['hd'].append((codec_name, codec_extension, 'hd'))
1180 elif 'sd' in config["video"]["files"][codec_name]:
1181 files['sd'].append((codec_name, codec_extension, 'sd'))
1183 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first candidate in hd > sd > other preference order; the loop
# exit after a match (break/else) is elided in this listing.
1185 for quality in ('hd', 'sd', 'other'):
1186 if len(files[quality]) > 0:
1187 video_quality = files[quality][0][2]
1188 video_codec = files[quality][0][0]
1189 video_extension = files[quality][0][1]
1190 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1193 self._downloader.report_error(u'no known codec found')
# Build the signed play_redirect URL from the request signature/timestamp.
1196 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1197 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Result dictionary (surrounding `return [{...}]` lines elided in listing).
1202 'uploader': video_uploader,
1203 'uploader_id': video_uploader_id,
1204 'upload_date': video_upload_date,
1205 'title': video_title,
1206 'ext': video_extension,
1207 'thumbnail': video_thumbnail,
1208 'description': video_description,
1212 class ArteTvIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines — `try:` headers, guard
# clauses and `return` statements around the calls below are not all visible.
1213 """arte.tv information extractor."""
1215 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1216 _LIVE_URL = r'index-[0-9]+\.html$'
1218 IE_NAME = u'arte.tv'
1220 def __init__(self, downloader=None):
1221 InfoExtractor.__init__(self, downloader)
1223 def report_download_webpage(self, video_id):
1224 """Report webpage download."""
1225 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1227 def report_extraction(self, video_id):
1228 """Report information extraction."""
1229 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1231 def fetch_webpage(self, url):
# Download `url` and return the page body; errors are routed through the
# downloader's error reporting (the `try:` and `return` lines are elided).
1232 request = compat_urllib_request.Request(url)
1234 self.report_download_webpage(url)
1235 webpage = compat_urllib_request.urlopen(request).read()
1236 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1237 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1239 except ValueError as err:
1240 self._downloader.report_error(u'Invalid URL: %s' % url)
1244 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch `url`, apply `regex` with `regexFlags`, and collect the capture
# groups named by matchTuples [(group_index, key, error_message), ...]
# into an info dict (the dict init and `return info` are elided here).
1245 page = self.fetch_webpage(url)
1246 mobj = re.search(regex, page, regexFlags)
1250 self._downloader.report_error(u'Invalid URL: %s' % url)
1253 for (i, key, err) in matchTuples:
1254 if mobj.group(i) is None:
1255 self._downloader.trouble(err)
1258 info[key] = mobj.group(i)
1262 def extractLiveStream(self, url):
# Two-step scrape: find the videothek JS on the page, then pull the
# geo-gated stream path + SWF player out of that JS.
1263 video_lang = url.split('/')[-4]
1264 info = self.grep_webpage(
1266 r'src="(.*?/videothek_js.*?\.js)',
1269 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1272 http_host = url.split('/')[2]
1273 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1274 info = self.grep_webpage(
1276 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1277 '(http://.*?\.swf).*?' +
1281 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1282 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1283 (3, 'url', u'ERROR: could not extract video url: %s' % url)
# NOTE(review): video_url is assembled but no `return` is visible in this
# method — the caller in _real_extract also discards the call's result.
# Confirm whether live-stream extraction actually produces output.
1286 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1288 def extractPlus7Stream(self, url):
# Three-step scrape for the +7 catch-up player: movie param -> language
# <video> ref -> final metadata (id, title, date, hd url).
1289 video_lang = url.split('/')[-3]
1290 info = self.grep_webpage(
1292 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1295 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1298 next_url = compat_urllib_parse.unquote(info.get('url'))
1299 info = self.grep_webpage(
1301 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1304 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1307 next_url = compat_urllib_parse.unquote(info.get('url'))
1309 info = self.grep_webpage(
1311 r'<video id="(.*?)".*?>.*?' +
1312 '<name>(.*?)</name>.*?' +
1313 '<dateVideo>(.*?)</dateVideo>.*?' +
1314 '<url quality="hd">(.*?)</url>',
1317 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1318 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1319 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1320 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# Result dictionary (surrounding return/literal lines elided in listing).
1325 'id': info.get('id'),
1326 'url': compat_urllib_parse.unquote(info.get('url')),
1327 'uploader': u'arte.tv',
1328 'upload_date': info.get('date'),
1329 'title': info.get('title').decode('utf-8'),
1335 def _real_extract(self, url):
# Dispatch on URL shape: live index pages vs +7 catch-up pages.
1336 video_id = url.split('/')[-1]
1337 self.report_extraction(video_id)
1339 if re.search(self._LIVE_URL, video_id) is not None:
1340 self.extractLiveStream(url)
1343 info = self.extractPlus7Stream(url)
1348 class GenericIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines (guards, `try:`s,
# `return`s and some method bodies are not fully visible).
1349 """Generic last-resort information extractor."""
1352 IE_NAME = u'generic'
1354 def __init__(self, downloader=None):
1355 InfoExtractor.__init__(self, downloader)
1357 def report_download_webpage(self, video_id):
1358 """Report webpage download."""
# The fallback warning is suppressed in test mode to keep test output clean.
1359 if not self._downloader.params.get('test', False):
1360 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1361 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1363 def report_extraction(self, video_id):
1364 """Report information extraction."""
1365 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1367 def report_following_redirect(self, new_url):
1368 """Report information extraction."""
1369 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1371 def _test_redirect(self, url):
1372 """Check if it is a redirect, like url shorteners, in case return the new url."""
1373 class HeadRequest(compat_urllib_request.Request):
# Forces the HTTP method to HEAD (the return value of get_method is
# elided in this listing — presumably "HEAD").
1374 def get_method(self):
1377 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1379 Subclass the HTTPRedirectHandler to make it use our
1380 HeadRequest also on the redirected URL
1382 def redirect_request(self, req, fp, code, msg, headers, newurl):
1383 if code in (301, 302, 303, 307):
1384 newurl = newurl.replace(' ', '%20')
# Strip body-describing headers, which don't apply to the new HEAD.
1385 newheaders = dict((k,v) for k,v in req.headers.items()
1386 if k.lower() not in ("content-length", "content-type"))
1387 return HeadRequest(newurl,
1389 origin_req_host=req.get_origin_req_host(),
1392 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1394 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1396 Fallback to GET if HEAD is not allowed (405 HTTP error)
1398 def http_error_405(self, req, fp, code, msg, headers):
1402 newheaders = dict((k,v) for k,v in req.headers.items()
1403 if k.lower() not in ("content-length", "content-type"))
# Re-issue the same URL as a plain (GET) Request through the parent opener.
1404 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1406 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with just the handlers needed for the HEAD probe.
1410 opener = compat_urllib_request.OpenerDirector()
1411 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1412 HTTPMethodFallback, HEADRedirectHandler,
1413 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1414 opener.add_handler(handler())
1416 response = opener.open(HeadRequest(url))
1417 new_url = response.geturl()
# The comparison of new_url against the original url (and the non-redirect
# return path) is elided in this listing.
1422 self.report_following_redirect(new_url)
1425 def _real_extract(self, url):
1426 new_url = self._test_redirect(url)
1427 if new_url: return [self.url_result(new_url)]
1429 video_id = url.split('/')[-1]
1431 webpage = self._download_webpage(url, video_id)
1432 except ValueError as err:
1433 # since this is the last-resort InfoExtractor, if
1434 # this error is thrown, it'll be thrown here
1435 self._downloader.report_error(u'Invalid URL: %s' % url)
1438 self.report_extraction(video_id)
1439 # Start with something easy: JW Player in SWFObject
# Three progressively broader regexes for a direct media URL; each fallback
# presumably runs only when the previous search failed (guards elided).
1440 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1442 # Broaden the search a little bit
1443 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1445 # Broaden the search a little bit: JWPlayer JS loader
1446 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1448 self._downloader.report_error(u'Invalid URL: %s' % url)
1451 # It's possible that one of the regexes
1452 # matched, but returned an empty group:
1453 if mobj.group(1) is None:
1454 self._downloader.report_error(u'Invalid URL: %s' % url)
1457 video_url = compat_urllib_parse.unquote(mobj.group(1))
1458 video_id = os.path.basename(video_url)
1460 # here's a fun little line of code for you:
1461 video_extension = os.path.splitext(video_id)[1][1:]
1462 video_id = os.path.splitext(video_id)[0]
1464 # it's tempting to parse this further, but you would
1465 # have to take into account all the variations like
1466 # Video Title - Site Name
1467 # Site Name | Video Title
1468 # Video Title - Tagline | Site Name
1469 # and so on and so forth; it's just not practical
1470 mobj = re.search(r'<title>(.*)</title>', webpage)
1472 self._downloader.report_error(u'unable to extract title')
1474 video_title = mobj.group(1)
1476 # video uploader is domain name
1477 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1479 self._downloader.report_error(u'unable to extract title')
1481 video_uploader = mobj.group(1)
# Result dictionary (surrounding `return [{...}]` lines elided in listing).
1486 'uploader': video_uploader,
1487 'upload_date': None,
1488 'title': video_title,
1489 'ext': video_extension,
1493 class YoutubeSearchIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines — the `if mobj is None:`
# guards, `try:` headers and some `return`s are not visible.
1494 """Information Extractor for YouTube search queries."""
1495 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1496 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1497 _max_youtube_results = 1000
1498 IE_NAME = u'youtube:search'
1500 def __init__(self, downloader=None):
1501 InfoExtractor.__init__(self, downloader)
1503 def report_download_page(self, query, pagenum):
1504 """Report attempt to download search page with given number."""
# query.decode(...) implies a byte-string query (Python 2 era).
1505 query = query.decode(preferredencoding())
1506 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1508 def _real_extract(self, query):
# Parse the "ytsearch<N>:terms" form: no prefix -> 1 result,
# "all" -> _max_youtube_results, otherwise N (clamped to the max).
1509 mobj = re.match(self._VALID_URL, query)
1511 self._downloader.report_error(u'invalid search query "%s"' % query)
1514 prefix, query = query.split(':')
1516 query = query.encode('utf-8')
1518 self._download_n_results(query, 1)
1520 elif prefix == 'all':
1521 self._download_n_results(query, self._max_youtube_results)
1527 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1529 elif n > self._max_youtube_results:
1530 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1531 n = self._max_youtube_results
1532 self._download_n_results(query, n)
1534 except ValueError: # parsing prefix as integer fails
1535 self._download_n_results(query, 1)
1538 def _download_n_results(self, query, n):
1539 """Downloads a specified number of results for a query"""
# Pages through the GData API 50 ids at a time, then hands each watch URL
# straight to the downloader (rather than returning info dicts).
1545 while (50 * pagenum) < limit:
1546 self.report_download_page(query, pagenum+1)
1547 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1548 request = compat_urllib_request.Request(result_url)
1550 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1551 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1552 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1554 api_response = json.loads(data)['data']
1556 if not 'items' in api_response:
1557 self._downloader.trouble(u'[youtube] No video results')
1560 new_ids = list(video['id'] for video in api_response['items'])
1561 video_ids += new_ids
# Tighten the loop bound to what the API says actually exists.
1563 limit = min(n, api_response['totalItems'])
1566 if len(video_ids) > n:
1567 video_ids = video_ids[:n]
1568 for id in video_ids:
1569 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1573 class GoogleSearchIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines (guards, `try:`s,
# loop/return scaffolding not fully visible). Structure parallels
# YoutubeSearchIE but scrapes HTML result pages instead of an API.
1574 """Information Extractor for Google Video search queries."""
1575 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1576 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1577 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1578 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1579 _max_google_results = 1000
1580 IE_NAME = u'video.google:search'
1582 def __init__(self, downloader=None):
1583 InfoExtractor.__init__(self, downloader)
1585 def report_download_page(self, query, pagenum):
1586 """Report attempt to download playlist page with given number."""
1587 query = query.decode(preferredencoding())
1588 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1590 def _real_extract(self, query):
# Parse "gvsearch<N>:terms": no prefix -> 1, "all" -> max, else N clamped.
1591 mobj = re.match(self._VALID_URL, query)
1593 self._downloader.report_error(u'invalid search query "%s"' % query)
1596 prefix, query = query.split(':')
1598 query = query.encode('utf-8')
1600 self._download_n_results(query, 1)
1602 elif prefix == 'all':
1603 self._download_n_results(query, self._max_google_results)
1609 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1611 elif n > self._max_google_results:
1612 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1613 n = self._max_google_results
1614 self._download_n_results(query, n)
1616 except ValueError: # parsing prefix as integer fails
1617 self._download_n_results(query, 1)
1620 def _download_n_results(self, query, n):
1621 """Downloads a specified number of results for a query"""
# Scrape result pages 10 at a time until n ids are collected or the
# "next page" marker disappears; then queue each videoplay URL.
1627 self.report_download_page(query, pagenum)
1628 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1629 request = compat_urllib_request.Request(result_url)
1631 page = compat_urllib_request.urlopen(request).read()
1632 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1633 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1636 # Extract video identifiers
1637 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1638 video_id = mobj.group(1)
1639 if video_id not in video_ids:
1640 video_ids.append(video_id)
1641 if len(video_ids) == n:
1642 # Specified n videos reached
1643 for id in video_ids:
1644 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No more pages: download whatever was collected and stop.
1647 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1648 for id in video_ids:
1649 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1652 pagenum = pagenum + 1
1655 class YahooSearchIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines. Same shape as
# GoogleSearchIE, but with an explicit `already_seen` set for dedup.
1656 """Information Extractor for Yahoo! Video search queries."""
1659 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1660 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1661 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1662 _MORE_PAGES_INDICATOR = r'\s*Next'
1663 _max_yahoo_results = 1000
1664 IE_NAME = u'video.yahoo:search'
1666 def __init__(self, downloader=None):
1667 InfoExtractor.__init__(self, downloader)
1669 def report_download_page(self, query, pagenum):
1670 """Report attempt to download playlist page with given number."""
1671 query = query.decode(preferredencoding())
1672 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1674 def _real_extract(self, query):
# Parse "yvsearch<N>:terms": no prefix -> 1, "all" -> max, else N clamped.
1675 mobj = re.match(self._VALID_URL, query)
1677 self._downloader.report_error(u'invalid search query "%s"' % query)
1680 prefix, query = query.split(':')
1682 query = query.encode('utf-8')
1684 self._download_n_results(query, 1)
1686 elif prefix == 'all':
1687 self._download_n_results(query, self._max_yahoo_results)
1693 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1695 elif n > self._max_yahoo_results:
1696 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1697 n = self._max_yahoo_results
1698 self._download_n_results(query, n)
1700 except ValueError: # parsing prefix as integer fails
1701 self._download_n_results(query, 1)
1704 def _download_n_results(self, query, n):
1705 """Downloads a specified number of results for a query"""
1708 already_seen = set()
1712 self.report_download_page(query, pagenum)
1713 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1714 request = compat_urllib_request.Request(result_url)
1716 page = compat_urllib_request.urlopen(request).read()
1717 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1718 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1721 # Extract video identifiers
1722 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1723 video_id = mobj.group(1)
1724 if video_id not in already_seen:
1725 video_ids.append(video_id)
1726 already_seen.add(video_id)
1727 if len(video_ids) == n:
1728 # Specified n videos reached
1729 for id in video_ids:
1730 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No more pages: download whatever was collected and stop.
1733 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1734 for id in video_ids:
1735 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1738 pagenum = pagenum + 1
1741 class YoutubePlaylistIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines — parts of the verbose
# regex, the `if mobj is None:` guards, the page loop header and the
# definition of _MAX_RESULTS (referenced below) are not visible here.
1742 """Information Extractor for YouTube playlists."""
1744 _VALID_URL = r"""(?:
1749 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1750 \? (?:.*?&)*? (?:p|a|list)=
1753 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1756 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1758 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1760 IE_NAME = u'youtube:playlist'
1762 def __init__(self, downloader=None):
1763 InfoExtractor.__init__(self, downloader)
1766 def suitable(cls, url):
1767 """Receives a URL and returns True if suitable for this IE."""
# re.VERBOSE is required because _VALID_URL is written with whitespace
# and comments; the base-class suitable() would not pass that flag.
1768 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1770 def report_download_page(self, playlist_id, pagenum):
1771 """Report attempt to download playlist page with given number."""
1772 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1774 def _real_extract(self, url):
1775 # Extract playlist id
1776 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1778 self._downloader.report_error(u'invalid url: %s' % url)
1781 # Download playlist videos from API
# The id may come from either alternative of the pattern (full-page URL
# or bare playlist id).
1782 playlist_id = mobj.group(1) or mobj.group(2)
1787 self.report_download_page(playlist_id, page_num)
1789 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1791 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1792 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1793 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1797 response = json.loads(page)
1798 except ValueError as err:
1799 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1802 if 'feed' not in response:
1803 self._downloader.report_error(u'Got a malformed response from YouTube API')
1805 if 'entry' not in response['feed']:
1806 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, watch_url) pairs; entries without 'content'
# (e.g. deleted/private videos) are skipped.
1809 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1810 for entry in response['feed']['entry']
1811 if 'content' in entry ]
# A short page means this was the last page of the feed.
1813 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Sort by playlist position, keep only the URLs.
1817 videos = [v[1] for v in sorted(videos)]
1819 url_results = [self.url_result(url) for url in videos]
1820 return [self.playlist_result(url_results, playlist_id)]
1823 class YoutubeChannelIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines (loop headers, guards,
# `try:`s and `break`s are not all visible).
1824 """Information Extractor for YouTube channels."""
1826 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1827 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1828 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1829 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1830 IE_NAME = u'youtube:channel'
1832 def report_download_page(self, channel_id, pagenum):
1833 """Report attempt to download channel page with given number."""
1834 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1836 def extract_videos_from_page(self, page):
# Return the de-duplicated video ids found in watch links on `page`
# (the ids_in_page init and `return` lines are elided in this listing).
1838 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1839 if mobj.group(1) not in ids_in_page:
1840 ids_in_page.append(mobj.group(1))
1843 def _real_extract(self, url):
1844 # Extract channel id
1845 mobj = re.match(self._VALID_URL, url)
1847 self._downloader.report_error(u'invalid url: %s' % url)
1850 # Download channel page
1851 channel_id = mobj.group(1)
1855 self.report_download_page(channel_id, pagenum)
1856 url = self._TEMPLATE_URL % (channel_id, pagenum)
1857 request = compat_urllib_request.Request(url)
1859 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1860 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1861 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1864 # Extract video identifiers
1865 ids_in_page = self.extract_videos_from_page(page)
1866 video_ids.extend(ids_in_page)
1868 # Download any subsequent channel pages using the json-based channel_ajax query
# Pagination switches style here: first page is plain HTML, subsequent
# pages come back as JSON with the HTML fragment in 'content_html'.
1869 if self._MORE_PAGES_INDICATOR in page:
1871 pagenum = pagenum + 1
1873 self.report_download_page(channel_id, pagenum)
1874 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1875 request = compat_urllib_request.Request(url)
1877 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1878 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1879 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1882 page = json.loads(page)
1884 ids_in_page = self.extract_videos_from_page(page['content_html'])
1885 video_ids.extend(ids_in_page)
# Stop when the load-more widget no longer advertises further pages.
1887 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1890 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1892 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1893 url_entries = [self.url_result(url) for url in urls]
1894 return [self.playlist_result(url_entries, channel_id)]
1897 class YoutubeUserIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines (guards, `try:`s, the
# pagination loop header and `break` are not all visible).
1898 """Information Extractor for YouTube users."""
1900 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1901 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1902 _GDATA_PAGE_SIZE = 50
1903 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1904 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1905 IE_NAME = u'youtube:user'
1907 def __init__(self, downloader=None):
1908 InfoExtractor.__init__(self, downloader)
1910 def report_download_page(self, username, start_index):
1911 """Report attempt to download user page."""
1912 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1913 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1915 def _real_extract(self, url):
1917 mobj = re.match(self._VALID_URL, url)
1919 self._downloader.report_error(u'invalid url: %s' % url)
1922 username = mobj.group(1)
1924 # Download video ids using YouTube Data API. Result size per
1925 # query is limited (currently to 50 videos) so we need to query
1926 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
1933 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1934 self.report_download_page(username, start_index)
1936 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1939 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1940 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1941 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1944 # Extract video identifiers
# Scrape watch?v= ids out of the feed XML, de-duplicating within a page.
1947 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1948 if mobj.group(1) not in ids_in_page:
1949 ids_in_page.append(mobj.group(1))
1951 video_ids.extend(ids_in_page)
1953 # A little optimization - if current page is not
1954 # "full", ie. does not contain PAGE_SIZE video ids then
1955 # we can assume that this page is the last one - there
1956 # are no more ids on further pages - no need to query
1959 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1964 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1965 url_results = [self.url_result(url) for url in urls]
1966 return [self.playlist_result(url_results, playlist_title = username)]
# Extractor that enumerates every video of a blip.tv user by paging through
# the mobile "show_get_full_episode_list" AJAX endpoint.
1969 class BlipTVUserIE(InfoExtractor):
1970 """Information Extractor for blip.tv users."""
1972 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1974 IE_NAME = u'blip.tv:user'
1976 def __init__(self, downloader=None):
1977 InfoExtractor.__init__(self, downloader)
1979 def report_download_page(self, username, pagenum):
1980 """Report attempt to download user page."""
1981 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1982 (self.IE_NAME, username, pagenum))
# Resolve the username to a numeric users_id, then page through the episode
# list collecting hrefs, and return one playlist of url results.
1984 def _real_extract(self, url):
1986 mobj = re.match(self._VALID_URL, url)
1988 self._downloader.report_error(u'invalid url: %s' % url)
1991 username = mobj.group(1)
1993 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1995 request = compat_urllib_request.Request(url)
1998 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
# The numeric users_id is scraped from the user's HTML page.
1999 mobj = re.search(r'data-users-id="([^"]+)"', page)
2000 page_base = page_base % mobj.group(1)
2001 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2002 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2006 # Download video ids using BlipTV Ajax calls. Result size per
2007 # query is limited (currently to 12 videos) so we need to query
2008 # page by page until there are no video ids - it means we got
2015 self.report_download_page(username, pagenum)
2016 url = page_base + "&page=" + str(pagenum)
2017 request = compat_urllib_request.Request( url )
2019 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
2020 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): this uses str(err) while every sibling extractor uses
# compat_str(err) — inconsistent; consider aligning with compat_str.
2021 self._downloader.report_error(u'unable to download webpage: %s' % str(err))
2024 # Extract video identifiers
2027 for mobj in re.finditer(r'href="/([^"]+)"', page):
2028 if mobj.group(1) not in ids_in_page:
2029 ids_in_page.append(unescapeHTML(mobj.group(1)))
2031 video_ids.extend(ids_in_page)
2033 # A little optimization - if current page is not
2034 # "full", ie. does not contain PAGE_SIZE video ids then
2035 # we can assume that this page is the last one - there
2036 # are no more ids on further pages - no need to query
# NOTE(review): self._PAGE_SIZE is referenced here but its definition is not
# visible in this capture — confirm it is declared on this class upstream.
2039 if len(ids_in_page) < self._PAGE_SIZE:
2044 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
2045 (self.IE_NAME, username, all_ids_count, len(video_ids)))
2047 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
2048 url_entries = [self.url_result(url) for url in urls]
2049 return [self.playlist_result(url_entries, playlist_title = username)]
# Extractor for depositfiles.com: simulates pressing the "Free download"
# button and scrapes the real file URL and title out of the response page.
2052 class DepositFilesIE(InfoExtractor):
2053 """Information extractor for depositfiles.com"""
2055 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2057 def report_download_webpage(self, file_id):
2058 """Report webpage download."""
2059 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2061 def report_extraction(self, file_id):
2062 """Report information extraction."""
2063 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2065 def _real_extract(self, url):
2066 file_id = url.split('/')[-1]
2067 # Rebuild url in english locale
2068 url = 'http://depositfiles.com/en/files/' + file_id
2070 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 emulates the free-download form submission.
2071 free_download_indication = { 'gateway_result' : '1' }
2072 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2074 self.report_download_webpage(file_id)
# NOTE(review): .read() is not decoded here, so `webpage` is bytes; the
# str regex searches below would raise TypeError on Python 3 — verify.
2075 webpage = compat_urllib_request.urlopen(request).read()
2076 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2077 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
2080 # Search for the real file URL
2081 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2082 if (mobj is None) or (mobj.group(1) is None):
2083 # Try to figure out reason of the error.
2084 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2085 if (mobj is not None) and (mobj.group(1) is not None):
# NOTE(review): '\s+' should be a raw string r'\s+' to avoid an invalid
# escape-sequence warning on modern Python.
2086 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2087 self._downloader.report_error(u'%s' % restriction_message)
2089 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
2092 file_url = mobj.group(1)
2093 file_extension = os.path.splitext(file_url)[1][1:]
2095 # Search for file title
2096 mobj = re.search(r'<b title="(.*?)">', webpage)
2098 self._downloader.report_error(u'unable to extract title')
# NOTE(review): .decode('utf-8') on these values is a Python 2 leftover;
# on Python 3 `str` has no .decode and this would raise AttributeError.
2100 file_title = mobj.group(1).decode('utf-8')
2103 'id': file_id.decode('utf-8'),
2104 'url': file_url.decode('utf-8'),
2106 'upload_date': None,
2107 'title': file_title,
2108 'ext': file_extension.decode('utf-8'),
# Extractor for Facebook videos. Optionally logs in (credentials from
# command-line options or .netrc) before extraction, then parses the SWF
# parameter blob embedded in the video page.
2112 class FacebookIE(InfoExtractor):
2113 """Information Extractor for Facebook"""
2115 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2116 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2117 _NETRC_MACHINE = 'facebook'
2118 IE_NAME = u'facebook'
2120 def report_login(self):
2121 """Report attempt to log in."""
2122 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
# Perform an optional login: use --username/--password if given, otherwise
# fall back to the 'facebook' entry in ~/.netrc when --netrc was requested.
# Login failure only warns; extraction proceeds unauthenticated.
2124 def _real_initialize(self):
2125 if self._downloader is None:
2130 downloader_params = self._downloader.params
2132 # Attempt to use provided username and password or .netrc data
2133 if downloader_params.get('username', None) is not None:
2134 useremail = downloader_params['username']
2135 password = downloader_params['password']
2136 elif downloader_params.get('usenetrc', False):
2138 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2139 if info is not None:
2143 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2144 except (IOError, netrc.NetrcParseError) as err:
2145 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# No credentials available: skip the login step entirely.
2148 if useremail is None:
2157 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2160 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
# NOTE(review): "exceded" is a typo in this user-facing string ("exceeded");
# it is runtime text, so it is left untouched in this documentation pass.
2161 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2162 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2164 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2165 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Extract a single video: locate the JSON blob passed to the Flash player,
# prefer the HD source, fall back to SD, and scrape the page title.
2168 def _real_extract(self, url):
2169 mobj = re.match(self._VALID_URL, url)
2171 self._downloader.report_error(u'invalid URL: %s' % url)
2173 video_id = mobj.group('ID')
2175 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2176 webpage = self._download_webpage(url, video_id)
# The player parameters sit between these two literal script fragments.
2178 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2179 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2180 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2182 raise ExtractorError(u'Cannot parse data')
2183 data = dict(json.loads(m.group(1)))
2184 params_raw = compat_urllib_parse.unquote(data['params'])
2185 params = json.loads(params_raw)
2186 video_data = params['video_data'][0]
# Prefer the HD stream when present; otherwise use the SD stream.
2187 video_url = video_data.get('hd_src')
2189 video_url = video_data['sd_src']
2191 raise ExtractorError(u'Cannot find video URL')
2192 video_duration = int(video_data['video_duration'])
2193 thumbnail = video_data['thumbnail_src']
2195 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2197 raise ExtractorError(u'Cannot find title in webpage')
2198 video_title = unescapeHTML(m.group(1))
2202 'title': video_title,
2205 'duration': video_duration,
2206 'thumbnail': thumbnail,
# Extractor for single blip.tv videos. Resolves /play/ redirect URLs, then
# queries the JSON API (spoofing an iTunes User-Agent) unless the server
# answers with a direct video download.
2211 class BlipTVIE(InfoExtractor):
2212 """Information extractor for blip.tv"""
2214 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2215 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2216 IE_NAME = u'blip.tv'
2218 def report_extraction(self, file_id):
2219 """Report information extraction."""
2220 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2222 def report_direct_download(self, title):
2223 """Report information extraction."""
2224 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2226 def _real_extract(self, url):
2227 mobj = re.match(self._VALID_URL, url)
2229 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to a page whose fragment carries the real file id;
# follow the redirect, rebuild a canonical URL, and recurse once.
2232 urlp = compat_urllib_parse_urlparse(url)
2233 if urlp.path.startswith('/play/'):
2234 request = compat_urllib_request.Request(url)
2235 response = compat_urllib_request.urlopen(request)
2236 redirecturl = response.geturl()
2237 rurlp = compat_urllib_parse_urlparse(redirecturl)
2238 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2239 url = 'http://blip.tv/a/a-' + file_id
2240 return self._real_extract(url)
2247 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2248 request = compat_urllib_request.Request(json_url)
# blip.tv serves different (direct) responses to iTunes clients.
2249 request.add_header('User-Agent', 'iTunes/10.6.1')
2250 self.report_extraction(mobj.group(1))
2253 urlh = compat_urllib_request.urlopen(request)
2254 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2255 basename = url.split('/')[-1]
2256 title,ext = os.path.splitext(basename)
# NOTE(review): os.path.splitext returns str on Python 3, which has no
# .decode — this line is a Python 2 leftover; verify against upstream.
2257 title = title.decode('UTF-8')
2258 ext = ext.replace('.', '')
2259 self.report_direct_download(title)
2264 'upload_date': None,
2269 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2270 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2271 if info is None: # Regular URL
2273 json_code_bytes = urlh.read()
2274 json_code = json_code_bytes.decode('utf-8')
2275 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2276 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2280 json_data = json.loads(json_code)
2281 if 'Post' in json_data:
2282 data = json_data['Post']
# Convert blip.tv's "MM-DD-YY HH:MM(am|pm)" stamp to YYYYMMDD.
2286 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2287 video_url = data['media']['url']
2288 umobj = re.match(self._URL_EXT, video_url)
2290 raise ValueError('Can not determine filename extension')
2291 ext = umobj.group(1)
2294 'id': data['item_id'],
2296 'uploader': data['display_name'],
2297 'upload_date': upload_date,
2298 'title': data['title'],
2300 'format': data['media']['mimeType'],
2301 'thumbnail': data['thumbnailUrl'],
2302 'description': data['description'],
2303 'player_url': data['embedUrl'],
# The same iTunes UA must be used for the media download itself.
2304 'user_agent': 'iTunes/10.6.1',
2306 except (ValueError,KeyError) as err:
2307 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the FLV url, title and id from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: this previously called self._download.report_error —
            # '_download' does not exist (the attribute is '_downloader'),
            # so invalid URLs raised AttributeError instead of being reported.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media server base URL; the playable
        # FLV lives at '<base>/<video_id>.flv'.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
# Extractor for The Daily Show / The Colbert Report. Resolves shorthand URLs,
# downloads an MRSS index of episode parts, then picks an RTMP rendition and
# rewrites it to a plain HTTP mp4 URL.
2362 class ComedyCentralIE(InfoExtractor):
2363 """Information extractor for The Daily Show and Colbert Report """
2365 # urls can be abbreviations like :thedailyshow or :colbert
2366 # urls for episodes like:
2367 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2368 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2369 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2370 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2371 |(https?://)?(www\.)?
2372 (?P<showname>thedailyshow|colbertnation)\.com/
2373 (full-episodes/(?P<episode>.*)|
2375 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2376 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Known bitrates, ordered worst-to-best; the last entry is the default pick.
2379 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2381 _video_extensions = {
2389 _video_dimensions = {
# suitable() is overridden because _VALID_URL is a re.VERBOSE pattern and the
# base-class match (see head of file) would not pass that flag.
2399 def suitable(cls, url):
2400 """Receives a URL and returns True if suitable for this IE."""
2401 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2403 def report_extraction(self, episode_id):
2404 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2406 def report_config_download(self, episode_id, media_id):
2407 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2409 def report_index_download(self, episode_id):
2410 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2412 def _print_formats(self, formats):
2413 print('Available formats:')
2415 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2418 def _real_extract(self, url):
2419 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2421 self._downloader.report_error(u'invalid URL: %s' % url)
# Shorthand ":tds" / ":colbert" forms are rewritten to full-episode URLs
# and re-matched so the named groups below are populated.
2424 if mobj.group('shortname'):
2425 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2426 url = u'http://www.thedailyshow.com/full-episodes/'
2428 url = u'http://www.colbertnation.com/full-episodes/'
2429 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2430 assert mobj is not None
2432 if mobj.group('clip'):
2433 if mobj.group('showname') == 'thedailyshow':
2434 epTitle = mobj.group('tdstitle')
2436 epTitle = mobj.group('cntitle')
2439 dlNewest = not mobj.group('episode')
2441 epTitle = mobj.group('showname')
2443 epTitle = mobj.group('episode')
2445 req = compat_urllib_request.Request(url)
2446 self.report_extraction(epTitle)
2448 htmlHandle = compat_urllib_request.urlopen(req)
2449 html = htmlHandle.read()
2450 webpage = html.decode('utf-8')
2451 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2452 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# "Newest episode" pages redirect; re-match the final URL to find the
# concrete episode slug.
2455 url = htmlHandle.geturl()
2456 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2458 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2460 if mobj.group('episode') == '':
2461 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2463 epTitle = mobj.group('episode')
2465 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2467 if len(mMovieParams) == 0:
2468 # The Colbert Report embeds the information in a without
2469 # a URL prefix; so extract the alternate reference
2470 # and then add the URL prefix manually.
2472 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2473 if len(altMovieParams) == 0:
2474 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2477 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2479 uri = mMovieParams[0][1]
2480 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2481 self.report_index_download(epTitle)
2483 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2484 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2485 self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
# One MRSS <item> per episode part; each part is extracted independently.
2490 idoc = xml.etree.ElementTree.fromstring(indexXml)
2491 itemEls = idoc.findall('.//item')
2492 for partNum,itemEl in enumerate(itemEls):
2493 mediaId = itemEl.findall('./guid')[0].text
2494 shortMediaId = mediaId.split(':')[-1]
2495 showId = mediaId.split(':')[-2].replace('.com', '')
2496 officialTitle = itemEl.findall('./title')[0].text
2497 officialDate = itemEl.findall('./pubDate')[0].text
2499 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2500 compat_urllib_parse.urlencode({'uri': mediaId}))
2501 configReq = compat_urllib_request.Request(configUrl)
2502 self.report_config_download(epTitle, shortMediaId)
2504 configXml = compat_urllib_request.urlopen(configReq).read()
2505 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2506 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp-url) tuples for each rendition of this part.
2509 cdoc = xml.etree.ElementTree.fromstring(configXml)
2511 for rendition in cdoc.findall('.//rendition'):
2512 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2516 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2519 if self._downloader.params.get('listformats', None):
2520 self._print_formats([i[0] for i in turls])
2523 # For now, just pick the highest bitrate
2524 format,rtmp_video_url = turls[-1]
2526 # Get the format arg from the arg stream
2527 req_format = self._downloader.params.get('format', None)
2529 # Select format if we can find one
2532 format, rtmp_video_url = f, v
# Rewrite the rtmp:// URL into the equivalent plain-HTTP mp4 location on
# the llnwd.net mirror, which avoids needing rtmpdump.
2535 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2537 raise ExtractorError(u'Cannot transform RTMP url')
2538 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2539 video_url = base + m.group('finalid')
2541 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2546 'upload_date': officialDate,
2551 'description': officialTitle,
2553 results.append(info)
# Extractor for The Escapist: scrapes og: meta tags for description/thumbnail
# and the player URL, then fetches the player's JS "config" to get the media.
2558 class EscapistIE(InfoExtractor):
2559 """Information extractor for The Escapist """
2561 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2562 IE_NAME = u'escapist'
2564 def report_extraction(self, showName):
2565 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2567 def report_config_download(self, showName):
2568 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2570 def _real_extract(self, url):
2571 mobj = re.match(self._VALID_URL, url)
2573 self._downloader.report_error(u'invalid URL: %s' % url)
2575 showName = mobj.group('showname')
2576 videoId = mobj.group('episode')
2578 self.report_extraction(showName)
2580 webPage = compat_urllib_request.urlopen(url)
2581 webPageBytes = webPage.read()
# Decode using the charset from the Content-Type header, defaulting to UTF-8.
2582 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2583 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2584 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2585 self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
# NOTE(review): these .group(1) calls assume every meta tag is present;
# a missing tag would raise AttributeError on None — verify upstream.
2588 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2589 description = unescapeHTML(descMatch.group(1))
2590 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2591 imgUrl = unescapeHTML(imgMatch.group(1))
2592 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2593 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries a percent-encoded "config=" query with the real
# configuration location.
2594 configUrlMatch = re.search('config=(.*)$', playerUrl)
2595 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2597 self.report_config_download(showName)
2599 configJSON = compat_urllib_request.urlopen(configUrl)
2600 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2601 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2602 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2603 self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2606 # Technically, it's JavaScript, not JSON
# Crude JS->JSON fixup: swap single quotes for double quotes before parsing.
2607 configJSON = configJSON.replace("'", '"')
2610 config = json.loads(configJSON)
2611 except (ValueError,) as err:
2612 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2615 playlist = config['playlist']
# Index 1 of the playlist holds the actual video entry.
2616 videoUrl = playlist[1]['url']
2621 'uploader': showName,
2622 'upload_date': None,
2625 'thumbnail': imgUrl,
2626 'description': description,
2627 'player_url': playerUrl,
# Extractor for collegehumor.com: reads the moogaloop metadata XML, then the
# Adobe HDS (f4m) manifest, and reconstructs a direct segment URL.
2632 class CollegeHumorIE(InfoExtractor):
2633 """Information extractor for collegehumor.com"""
2636 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2637 IE_NAME = u'collegehumor'
2639 def report_manifest(self, video_id):
2640 """Report information extraction."""
2641 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2643 def report_extraction(self, video_id):
2644 """Report information extraction."""
2645 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2647 def _real_extract(self, url):
2648 mobj = re.match(self._VALID_URL, url)
2650 self._downloader.report_error(u'invalid URL: %s' % url)
2652 video_id = mobj.group('videoid')
2657 'upload_date': None,
2660 self.report_extraction(video_id)
2661 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2663 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2664 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2665 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# Pull title/description/thumbnail and the manifest location from the
# moogaloop metadata document.
2668 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2670 videoNode = mdoc.findall('./video')[0]
2671 info['description'] = videoNode.findall('./description')[0].text
2672 info['title'] = videoNode.findall('./caption')[0].text
2673 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2674 manifest_url = videoNode.findall('./file')[0].text
2676 self._downloader.report_error(u'Invalid metadata XML file')
# hdcore parameter is required for the HDS manifest endpoint to respond.
2679 manifest_url += '?hdcore=2.10.3'
2680 self.report_manifest(video_id)
2682 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2683 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2684 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# Note the f4m XML namespace on every element lookup below.
2687 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2689 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2690 node_id = media_node.attrib['url']
2691 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2692 except IndexError as err:
2693 self._downloader.report_error(u'Invalid manifest file')
# Rebuild the direct fragment URL from the manifest host plus ids.
2696 url_pr = compat_urllib_parse_urlparse(manifest_url)
2697 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# Extractor for xvideos.com: scrapes the flv url, title and thumbnail
# directly out of the watch-page HTML.
2704 class XVideosIE(InfoExtractor):
2705 """Information extractor for xvideos.com"""
2707 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2708 IE_NAME = u'xvideos'
2710 def report_extraction(self, video_id):
2711 """Report information extraction."""
2712 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2714 def _real_extract(self, url):
2715 mobj = re.match(self._VALID_URL, url)
2717 self._downloader.report_error(u'invalid URL: %s' % url)
2719 video_id = mobj.group(1)
2721 webpage = self._download_webpage(url, video_id)
2723 self.report_extraction(video_id)
# The page embeds a percent-encoded flv URL in its player query string.
2727 mobj = re.search(r'flv_url=(.+?)&', webpage)
2729 self._downloader.report_error(u'unable to extract video url')
2731 video_url = compat_urllib_parse.unquote(mobj.group(1))
2735 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2737 self._downloader.report_error(u'unable to extract video title')
2739 video_title = mobj.group(1)
2742 # Extract video thumbnail
2743 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2745 self._downloader.report_error(u'unable to extract video thumbnail')
# group(0) is used deliberately: the whole matched URL is the thumbnail.
2747 video_thumbnail = mobj.group(0)
2753 'upload_date': None,
2754 'title': video_title,
2756 'thumbnail': video_thumbnail,
2757 'description': None,
# Extractor for individual SoundCloud tracks: resolves the page URL to a
# track id via the resolve API, then fetches the stream-URL map.
2763 class SoundcloudIE(InfoExtractor):
2764 """Information extractor for soundcloud.com
2765 To access the media, the uid of the song and a stream token
2766 must be extracted from the page source and the script must make
2767 a request to media.soundcloud.com/crossdomain.xml. Then
2768 the media can be grabbed by requesting from an url composed
2769 of the stream token and uid
2772 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2773 IE_NAME = u'soundcloud'
2775 def __init__(self, downloader=None):
2776 InfoExtractor.__init__(self, downloader)
2778 def report_resolve(self, video_id):
2779 """Report information extraction."""
2780 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2782 def report_extraction(self, video_id):
2783 """Report information extraction."""
2784 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2786 def _real_extract(self, url):
2787 mobj = re.match(self._VALID_URL, url)
2789 self._downloader.report_error(u'invalid URL: %s' % url)
2792 # extract uploader (which is in the url)
2793 uploader = mobj.group(1)
2794 # extract simple title (uploader + slug of song title)
2795 slug_title = mobj.group(2)
2796 simple_title = uploader + u'-' + slug_title
2798 self.report_resolve('%s/%s' % (uploader, slug_title))
# NOTE(review): the client_id below is a hard-coded API key shared with
# SoundcloudSetIE; consider hoisting it to a single class constant.
2800 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2801 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2802 request = compat_urllib_request.Request(resolv_url)
2804 info_json_bytes = compat_urllib_request.urlopen(request).read()
2805 info_json = info_json_bytes.decode('utf-8')
2806 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2807 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2810 info = json.loads(info_json)
2811 video_id = info['id']
2812 self.report_extraction('%s/%s' % (uploader, slug_title))
2814 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2815 request = compat_urllib_request.Request(streams_url)
2817 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2818 stream_json = stream_json_bytes.decode('utf-8')
2819 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2820 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
# Pick the standard 128kbps MP3 HTTP stream from the streams map.
2823 streams = json.loads(stream_json)
2824 mediaURL = streams['http_mp3_128_url']
2829 'uploader': info['user']['username'],
2830 'upload_date': info['created_at'],
2831 'title': info['title'],
2833 'description': info['description'],
# Extractor for SoundCloud sets (playlists): resolves the set, then fetches
# the stream map for every track in it.
# NOTE(review): this class calls self._downloader.trouble(u'ERROR: ...')
# throughout, while its sibling SoundcloudIE uses report_error — these should
# be made consistent (trouble is the older API).
2836 class SoundcloudSetIE(InfoExtractor):
2837 """Information extractor for soundcloud.com sets
2838 To access the media, the uid of the song and a stream token
2839 must be extracted from the page source and the script must make
2840 a request to media.soundcloud.com/crossdomain.xml. Then
2841 the media can be grabbed by requesting from an url composed
2842 of the stream token and uid
2845 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2846 IE_NAME = u'soundcloud'
2848 def __init__(self, downloader=None):
2849 InfoExtractor.__init__(self, downloader)
2851 def report_resolve(self, video_id):
2852 """Report information extraction."""
2853 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2855 def report_extraction(self, video_id):
2856 """Report information extraction."""
2857 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2859 def _real_extract(self, url):
2860 mobj = re.match(self._VALID_URL, url)
2862 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2865 # extract uploader (which is in the url)
2866 uploader = mobj.group(1)
2867 # extract simple title (uploader + slug of song title)
2868 slug_title = mobj.group(2)
2869 simple_title = uploader + u'-' + slug_title
2871 self.report_resolve('%s/sets/%s' % (uploader, slug_title))
2873 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2874 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2875 request = compat_urllib_request.Request(resolv_url)
2877 info_json_bytes = compat_urllib_request.urlopen(request).read()
2878 info_json = info_json_bytes.decode('utf-8')
2879 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2880 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# The resolve API reports failures inside the JSON body ('errors' key)
# rather than via HTTP status, so check for them explicitly.
2884 info = json.loads(info_json)
2885 if 'errors' in info:
2886 for err in info['errors']:
2887 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err['error_message']))
# One streams request per track in the set.
2890 for track in info['tracks']:
2891 video_id = track['id']
2892 self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2894 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2895 request = compat_urllib_request.Request(streams_url)
2897 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2898 stream_json = stream_json_bytes.decode('utf-8')
2899 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2900 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2903 streams = json.loads(stream_json)
2904 mediaURL = streams['http_mp3_128_url']
2909 'uploader': track['user']['username'],
2910 'upload_date': track['created_at'],
2911 'title': track['title'],
2913 'description': track['description'],
# Extractor for infoq.com presentations: the real media id is base64-encoded
# in the page's "jsclassref" attribute and played back over RTMPE.
2918 class InfoQIE(InfoExtractor):
2919 """Information extractor for infoq.com"""
2920 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2922 def report_extraction(self, video_id):
2923 """Report information extraction."""
2924 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2926 def _real_extract(self, url):
2927 mobj = re.match(self._VALID_URL, url)
2929 self._downloader.report_error(u'invalid URL: %s' % url)
2932 webpage = self._download_webpage(url, video_id=url)
2933 self.report_extraction(url)
# Decode the base64 + percent-encoded media reference into the RTMPE path.
2936 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2938 self._downloader.report_error(u'unable to extract video url')
2940 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2941 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2944 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2946 self._downloader.report_error(u'unable to extract video title')
2948 video_title = mobj.group(1)
2950 # Extract description
2951 video_description = u'No description available.'
2952 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2953 if mobj is not None:
2954 video_description = mobj.group(1)
# Derive the id and extension from the final path component of the URL.
2956 video_filename = video_url.split('/')[-1]
2957 video_id, extension = video_filename.split('.')
2963 'upload_date': None,
2964 'title': video_title,
2965 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2967 'description': video_description,
# Extractor for Mixcloud cloudcasts via the legacy /api/1/cloudcast JSON API.
2972 class MixcloudIE(InfoExtractor):
2973 """Information extractor for www.mixcloud.com"""
# _WORKING = False marks the IE broken: users are warned and tests skipped.
2975 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
# Two capture groups: (1) uploader slug, (2) cloudcast slug.
2976 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2977 IE_NAME = u'mixcloud'
def __init__(self, downloader=None):
    """Create the extractor; *downloader* is forwarded to the base class."""
    # Delegates straight to InfoExtractor.__init__ (single inheritance).
    super(MixcloudIE, self).__init__(downloader)
def report_download_json(self, file_id):
    """Tell the user the metadata JSON is being fetched (*file_id* unused)."""
    notice = u'[%s] Downloading json' % self.IE_NAME
    self._downloader.to_screen(notice)
def report_extraction(self, file_id):
    """Announce that metadata extraction for *file_id* has begun."""
    progress = u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)
    self._downloader.to_screen(progress)
# Pick the URL list for *fmt* from the parsed audio_formats JSON, choosing
# the highest bitrate when none (or 'best') is requested.
# NOTE(review): the enclosing "try:" and the final return are missing from
# this dump.
2990 def get_urls(self, jsonData, fmt, bitrate='best'):
2991 """Get urls from 'audio_formats' section in json"""
2994 bitrate_list = jsonData[fmt]
2995 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2996 bitrate = max(bitrate_list) # select highest
2998 url_list = jsonData[fmt][bitrate]
# If jsonData[fmt] is not indexable by bitrate, treat it as a flat URL list.
2999 except TypeError: # we have no bitrate info.
3000 url_list = jsonData[fmt]
# Probe each candidate URL with a HEAD-less GET; the first one that opens
# without a network error is considered active.
# NOTE(review): the inner "try:" and both return statements are elided here.
3003 def check_urls(self, url_list):
3004 """Returns 1st active url from list"""
3005 for url in url_list:
3007 compat_urllib_request.urlopen(url)
3009 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Dump every format/bitrate pair to stdout for --list-formats.
# NOTE(review): the "try:" opening the TypeError handler is missing from
# this dump.
3014 def _print_formats(self, formats):
3015 print('Available formats:')
3016 for fmt in formats.keys():
3017 for b in formats[fmt]:
3019 ext = formats[fmt][b][0]
3020 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
# Flat URL list (no per-bitrate dict): show '??' for the bitrate column.
3021 except TypeError: # we have no bitrate info
3022 ext = formats[fmt][0]
3023 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
# Resolve a cloudcast page to a direct file URL via the /api/1/cloudcast
# JSON endpoint, honoring --format / --list-formats.
# NOTE(review): guard lines ("if mobj is None:", "try:", returns, the final
# info-dict opener) are elided in this dump; comments cover visible code only.
3026 def _real_extract(self, url):
3027 mobj = re.match(self._VALID_URL, url)
3029 self._downloader.report_error(u'invalid URL: %s' % url)
3031 # extract uploader & filename from url
3032 uploader = mobj.group(1).decode('utf-8')
3033 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3035 # construct API request
3036 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3037 # retrieve .json file with links to files
3038 request = compat_urllib_request.Request(file_url)
3040 self.report_download_json(file_url)
3041 jsonData = compat_urllib_request.urlopen(request).read()
3042 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3043 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
3047 json_data = json.loads(jsonData)
3048 player_url = json_data['player_swf_url']
3049 formats = dict(json_data['audio_formats'])
3051 req_format = self._downloader.params.get('format', None)
3054 if self._downloader.params.get('listformats', None):
3055 self._print_formats(formats)
# 'best' (or no preference): first format whose URL list yields a live URL.
3058 if req_format is None or req_format == 'best':
3059 for format_param in formats.keys():
3060 url_list = self.get_urls(formats, format_param)
3062 file_url = self.check_urls(url_list)
3063 if file_url is not None:
3066 if req_format not in formats:
3067 self._downloader.report_error(u'format is not available')
3070 url_list = self.get_urls(formats, req_format)
3071 file_url = self.check_urls(url_list)
3072 format_param = req_format
3075 'id': file_id.decode('utf-8'),
3076 'url': file_url.decode('utf-8'),
3077 'uploader': uploader.decode('utf-8'),
3078 'upload_date': None,
3079 'title': json_data['name'],
3080 'ext': file_url.split('.')[-1].decode('utf-8'),
3081 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3082 'thumbnail': json_data['thumbnail_url'],
3083 'description': json_data['description'],
3084 'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open Classroom: handles single videos, course
# pages, and the site root (each recursing into the next level down).
3087 class StanfordOpenClassroomIE(InfoExtractor):
3088 """Information extractor for Stanford's Open ClassRoom"""
# Named groups: 'course' and optional 'video' select the page type.
3090 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3091 IE_NAME = u'stanfordoc'
def report_download_webpage(self, objid):
    """Announce that the page for *objid* is being downloaded."""
    progress = u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid)
    self._downloader.to_screen(progress)
def report_extraction(self, video_id):
    """Announce that extraction of *video_id* has started."""
    progress = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(progress)
# Three-way dispatch on the URL: specific video -> fetch its XML metadata;
# course page -> recurse over every VideoPage link; site root -> recurse
# over every CoursePage link, accumulating results via self.extract().
# NOTE(review): dict openers, "try:" lines and several guards are elided in
# this dump; comments cover visible code only.
3101 def _real_extract(self, url):
3102 mobj = re.match(self._VALID_URL, url)
3104 raise ExtractorError(u'Invalid URL: %s' % url)
3106 if mobj.group('course') and mobj.group('video'): # A specific video
3107 course = mobj.group('course')
3108 video = mobj.group('video')
3110 'id': course + '_' + video,
3112 'upload_date': None,
3115 self.report_extraction(info['id'])
3116 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3117 xmlUrl = baseUrl + video + '.xml'
3119 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3120 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3121 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# Title and file name come from the per-video XML descriptor.
3123 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3125 info['title'] = mdoc.findall('./title')[0].text
3126 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3128 self._downloader.report_error(u'Invalid metadata XML file')
3130 info['ext'] = info['url'].rpartition('.')[2]
3132 elif mobj.group('course'): # A course page
3133 course = mobj.group('course')
3138 'upload_date': None,
3141 coursepage = self._download_webpage(url, info['id'],
3142 note='Downloading course info page',
3143 errnote='Unable to download course info page')
3145 m = re.search('<h1>([^<]+)</h1>', coursepage)
3147 info['title'] = unescapeHTML(m.group(1))
3149 info['title'] = info['id']
3151 m = re.search('<description>([^<]+)</description>', coursepage)
3153 info['description'] = unescapeHTML(m.group(1))
# orderedSet keeps first occurrence of each VideoPage link, in page order.
3155 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3158 'type': 'reference',
3159 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3163 for entry in info['list']:
3164 assert entry['type'] == 'reference'
3165 results += self.extract(entry['url'])
3169 'id': 'Stanford OpenClassroom',
3172 'upload_date': None,
3175 self.report_download_webpage(info['id'])
3176 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3178 rootpage = compat_urllib_request.urlopen(rootURL).read()
3179 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3180 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3183 info['title'] = info['id']
3185 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3188 'type': 'reference',
3189 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3194 for entry in info['list']:
3195 assert entry['type'] == 'reference'
3196 results += self.extract(entry['url'])
# Extractor for MTV.com video pages (mediaGen XML playlist).
3199 class MTVIE(InfoExtractor):
3200 """Information extractor for MTV.com"""
# 'proto' group lets schemeless URLs through; 'videoid' is the numeric id.
3202 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
def report_extraction(self, video_id):
    """Announce that extraction of *video_id* has started."""
    progress = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(progress)
# Scrape mtv_vt/mtv_an/mtvn_uri meta tags, then fetch the mediaGen XML and
# pick the last (highest-quality) rendition.
# NOTE(review): "if mobj is None:"/"try:"/return lines are elided in this
# dump. Also the message u'unable to mtvn_uri' looks like it should read
# "unable to extract mtvn_uri" — cannot change runtime strings here.
3209 def _real_extract(self, url):
3210 mobj = re.match(self._VALID_URL, url)
3212 self._downloader.report_error(u'invalid URL: %s' % url)
# Normalize schemeless URLs before downloading.
3214 if not mobj.group('proto'):
3215 url = 'http://' + url
3216 video_id = mobj.group('videoid')
3218 webpage = self._download_webpage(url, video_id)
3220 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3222 self._downloader.report_error(u'unable to extract song name')
3224 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3225 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3227 self._downloader.report_error(u'unable to extract performer')
3229 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3230 video_title = performer + ' - ' + song_name
3232 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3234 self._downloader.report_error(u'unable to mtvn_uri')
3236 mtvn_uri = mobj.group(1)
3238 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3240 self._downloader.report_error(u'unable to extract content id')
3242 content_id = mobj.group(1)
3244 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3245 self.report_extraction(video_id)
3246 request = compat_urllib_request.Request(videogen_url)
3248 metadataXml = compat_urllib_request.urlopen(request).read()
3249 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3250 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3253 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3254 renditions = mdoc.findall('.//rendition')
3256 # For now, always pick the highest quality.
3257 rendition = renditions[-1]
# Format label built from MIME subtype + WxH + bitrate attributes.
3260 _,_,ext = rendition.attrib['type'].partition('/')
3261 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3262 video_url = rendition.find('./src').text
3264 self._downloader.trouble('Invalid rendition field.')
3270 'uploader': performer,
3271 'upload_date': None,
3272 'title': video_title,
# Extractor for v.youku.com; downloads segmented FLV streams whose file ids
# are descrambled with a seed-based substitution (see _get_file_id).
3280 class YoukuIE(InfoExtractor):
3281 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
def report_download_webpage(self, file_id):
    """Announce that the page for *file_id* is being downloaded."""
    progress = u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id)
    self._downloader.to_screen(progress)
def report_extraction(self, file_id):
    """Announce that extraction of *file_id* has started."""
    progress = u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)
    self._downloader.to_screen(progress)
# Session id: millisecond timestamp concatenated with two random numbers.
# NOTE(review): the "def _gen_sid(self):" line itself is elided in this dump.
3292 nowTime = int(time.time() * 1000)
3293 random1 = random.randint(1000,1998)
3294 random2 = random.randint(1000,9999)
3296 return "%d%d%d" %(nowTime,random1,random2)
# Produce the seed-shuffled alphabet used to decode scrambled file ids:
# repeatedly advance a 16-bit LCG and pull characters out of the source set.
# NOTE(review): the "mixed = []" initializer and the final return are elided
# in this dump.
3298 def _get_file_ID_mix_string(self, seed):
3300 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3302 for i in range(len(source)):
# LCG step: seed' = (seed*211 + 30031) mod 65536, then scaled into an index.
3303 seed = (seed * 211 + 30031 ) % 65536
3304 index = math.floor(seed / 65536 * len(source) )
3305 mixed.append(source[int(index)])
3306 source.remove(source[int(index)])
3307 #return ''.join(mixed)
# Decode a '*'-separated scrambled file id by indexing into the shuffled
# alphabet from _get_file_ID_mix_string.
# NOTE(review): the "realId = []" / "for ch in ids:" loop header lines are
# elided in this dump.
3310 def _get_file_id(self, fileId, seed):
3311 mixed = self._get_file_ID_mix_string(seed)
3312 ids = fileId.split('*')
3316 realId.append(mixed[int(ch)])
3317 return ''.join(realId)
# Fetch the getPlayList JSON, select a stream format, descramble the file
# id, and emit one info dict per FLV segment.
# NOTE(review): several guard/"try:"/assignment lines (e.g. format selection
# branches, the info-dict opener, the final return) are elided in this dump.
3319 def _real_extract(self, url):
3320 mobj = re.match(self._VALID_URL, url)
3322 self._downloader.report_error(u'invalid URL: %s' % url)
3324 video_id = mobj.group('ID')
3326 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3328 request = compat_urllib_request.Request(info_url, None, std_headers)
3330 self.report_download_webpage(video_id)
3331 jsondata = compat_urllib_request.urlopen(request).read()
3332 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3333 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3336 self.report_extraction(video_id)
3338 jsonstr = jsondata.decode('utf-8')
3339 config = json.loads(jsonstr)
3341 video_title = config['data'][0]['title']
3342 seed = config['data'][0]['seed']
3344 format = self._downloader.params.get('format', None)
3345 supported_format = list(config['data'][0]['streamfileids'].keys())
3347 if format is None or format == 'best':
3348 if 'hd2' in supported_format:
3353 elif format == 'worst':
3361 fileid = config['data'][0]['streamfileids'][format]
# One decryption key per segment.
3362 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3363 except (UnicodeDecodeError, ValueError, KeyError):
3364 self._downloader.report_error(u'unable to extract info section')
3368 sid = self._gen_sid()
3369 fileid = self._get_file_id(fileid, seed)
3371 #column 8,9 of fileid represent the segment number
3372 #fileid[7:9] should be changed
3373 for index, key in enumerate(keys):
3375 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3376 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3379 'id': '%s_part%02d' % (video_id, index),
3380 'url': download_url,
3382 'upload_date': None,
3383 'title': video_title,
3386 files_info.append(info)
# Extractor for video.xnxx.com; the three *_RE patterns below are applied
# to the raw page source in _real_extract.
3391 class XNXXIE(InfoExtractor):
3392 """Information extractor for xnxx.com"""
3394 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3396 VIDEO_URL_RE = r'flv_url=(.*?)&'
3397 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3398 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
def report_webpage(self, video_id):
    """Announce that the page for *video_id* is being downloaded."""
    progress = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
    self._downloader.to_screen(progress)
def report_extraction(self, video_id):
    """Announce that extraction of *video_id* has started."""
    progress = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(progress)
# Pull flv_url, title and thumbnail out of the page with the class-level
# regexes and return a single info dict.
# NOTE(review): "if ... is None:"/"try:"/return lines and the info-dict
# opener are elided in this dump.
3408 def _real_extract(self, url):
3409 mobj = re.match(self._VALID_URL, url)
3411 self._downloader.report_error(u'invalid URL: %s' % url)
3413 video_id = mobj.group(1)
3415 self.report_webpage(video_id)
3417 # Get webpage content
3419 webpage_bytes = compat_urllib_request.urlopen(url).read()
3420 webpage = webpage_bytes.decode('utf-8')
3421 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3422 self._downloader.report_error(u'unable to download video webpage: %s' % err)
# flv_url is percent-encoded in the page source.
3425 result = re.search(self.VIDEO_URL_RE, webpage)
3427 self._downloader.report_error(u'unable to extract video url')
3429 video_url = compat_urllib_parse.unquote(result.group(1))
3431 result = re.search(self.VIDEO_TITLE_RE, webpage)
3433 self._downloader.report_error(u'unable to extract video title')
3435 video_title = result.group(1)
3437 result = re.search(self.VIDEO_THUMB_RE, webpage)
3439 self._downloader.report_error(u'unable to extract video thumbnail')
3441 video_thumbnail = result.group(1)
3447 'upload_date': None,
3448 'title': video_title,
3450 'thumbnail': video_thumbnail,
3451 'description': None,
# Extractor for videos embedded in Google+ posts (two-step: post page, then
# the photos/video page it references).
3455 class GooglePlusIE(InfoExtractor):
3456 """Information extractor for plus.google.com."""
# Group 1 is the post id at the end of a .../posts/<id> URL.
3458 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3459 IE_NAME = u'plus.google'
def __init__(self, downloader=None):
    """Create the extractor; *downloader* is forwarded to the base class."""
    # Delegates straight to InfoExtractor.__init__ (single inheritance).
    super(GooglePlusIE, self).__init__(downloader)
def report_extract_entry(self, url):
    """Announce that the post entry at *url* is being downloaded."""
    notice = u'[plus.google] Downloading entry: %s' % url
    self._downloader.to_screen(notice)
def report_date(self, upload_date):
    """Announce the upload date found for the entry."""
    notice = u'[plus.google] Entry date: %s' % upload_date
    self._downloader.to_screen(notice)
def report_uploader(self, uploader):
    """Announce the uploader name found for the entry."""
    notice = u'[plus.google] Uploader: %s' % uploader
    self._downloader.to_screen(notice)
def report_title(self, video_title):
    """Announce the title found for the entry."""
    notice = u'[plus.google] Title: %s' % video_title
    self._downloader.to_screen(notice)
def report_extract_vid_page(self, video_page):
    """Announce that the secondary video page is being processed."""
    notice = u'[plus.google] Extracting video page: %s' % video_page
    self._downloader.to_screen(notice)
# Two-step extraction: scrape the post page for date/uploader/title and the
# photos/video page URL, then scrape that page for redirector.googlevideo
# links and keep the highest-resolution one.
# NOTE(review): "if mobj is None:"/"try:" lines and the final info-dict
# opener are elided in this dump; comments cover visible code only.
3484 def _real_extract(self, url):
3485 # Extract id from URL
3486 mobj = re.match(self._VALID_URL, url)
3488 self._downloader.report_error(u'Invalid URL: %s' % url)
3491 post_url = mobj.group(0)
3492 video_id = mobj.group(1)
3494 video_extension = 'flv'
3496 # Step 1, Retrieve post webpage to extract further information
3497 self.report_extract_entry(post_url)
3498 request = compat_urllib_request.Request(post_url)
3500 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3501 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3502 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3505 # Extract update date
3507 pattern = 'title="Timestamp">(.*?)</a>'
3508 mobj = re.search(pattern, webpage)
3510 upload_date = mobj.group(1)
3511 # Convert timestring to a format suitable for filename
3512 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3513 upload_date = upload_date.strftime('%Y%m%d')
3514 self.report_date(upload_date)
3518 pattern = r'rel\="author".*?>(.*?)</a>'
3519 mobj = re.search(pattern, webpage)
3521 uploader = mobj.group(1)
3522 self.report_uploader(uploader)
3525 # Get the first line for title
3527 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3528 mobj = re.search(pattern, webpage)
3530 video_title = mobj.group(1)
3531 self.report_title(video_title)
3533 # Step 2, Stimulate clicking the image box to launch video
3534 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3535 mobj = re.search(pattern, webpage)
3537 self._downloader.report_error(u'unable to extract video page URL')
3539 video_page = mobj.group(1)
3540 request = compat_urllib_request.Request(video_page)
3542 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3543 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3544 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3546 self.report_extract_vid_page(video_page)
3549 # Extract video links on video page
3550 """Extract video links of all sizes"""
3551 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3552 mobj = re.findall(pattern, webpage)
3554 self._downloader.report_error(u'unable to extract video links')
3556 # Sort in resolution
3557 links = sorted(mobj)
3559 # Choose the lowest of the sort, i.e. highest resolution
3560 video_url = links[-1]
3561 # Only get the url. The resolution part in the tuple has no use anymore
3562 video_url = video_url[-1]
3563 # Treat escaped \u0026 style hex
3565 video_url = video_url.decode("unicode_escape")
3566 except AttributeError: # Python 3
3567 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3573 'uploader': uploader,
3574 'upload_date': upload_date,
3575 'title': video_title,
3576 'ext': video_extension,
# Extractor for nba.com video pages; the direct MP4 URL is derived from the
# URL path rather than scraped.
3579 class NBAIE(InfoExtractor):
3580 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
# Build the CDN URL from the path and scrape title/date/description via the
# local _findProp helper.
# NOTE(review): guard/return lines and the info-dict opener are elided in
# this dump. The visible key 'uploader_date' looks like a typo for
# 'upload_date' — cannot change runtime data here.
3583 def _real_extract(self, url):
3584 mobj = re.match(self._VALID_URL, url)
3586 self._downloader.report_error(u'invalid URL: %s' % url)
3589 video_id = mobj.group(1)
3590 if video_id.endswith('/index.html'):
3591 video_id = video_id[:-len('/index.html')]
3593 webpage = self._download_webpage(url, video_id)
3595 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first regex group from the page, unescaped, else *default*.
3596 def _findProp(rexp, default=None):
3597 m = re.search(rexp, webpage)
3599 return unescapeHTML(m.group(1))
3603 shortened_video_id = video_id.rpartition('/')[2]
3604 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3606 'id': shortened_video_id,
3610 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3611 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv channels and archived broadcasts,
# paging through the archive API in _JUSTIN_PAGE_LIMIT-sized chunks.
3615 class JustinTVIE(InfoExtractor):
3616 """Information extractor for justin.tv and twitch.tv"""
3617 # TODO: One broadcast may be split into multiple videos. The key
3618 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3619 # starts at 1 and increases. Can we treat all parts as one video?
# Group 1: channel name; optional group 2: archive/broadcast id after /b/.
3621 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3622 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3623 _JUSTIN_PAGE_LIMIT = 100
3624 IE_NAME = u'justin.tv'
def report_extraction(self, file_id):
    """Announce that extraction of *file_id* has started."""
    progress = u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)
    self._downloader.to_screen(progress)
def report_download_page(self, channel, offset):
    """Announce the fetch of one page of *channel*'s video list."""
    upper_bound = offset + self._JUSTIN_PAGE_LIMIT
    fields = (self.IE_NAME, channel, offset, upper_bound)
    self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' % fields)
3635 # Return count of items, list of *valid* items
# Fetch one API page of clips and convert each into an info dict.
# NOTE(review): the "try:", the info-list initializer, the per-clip dict
# opener and some guards are elided in this dump.
3636 def _parse_page(self, url):
3638 urlh = compat_urllib_request.urlopen(url)
3639 webpage_bytes = urlh.read()
3640 webpage = webpage_bytes.decode('utf-8', 'ignore')
3641 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3642 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
# A non-list response is an API error payload with an 'error' field.
3645 response = json.loads(webpage)
3646 if type(response) != list:
3647 error_text = response.get('error', 'unknown error')
3648 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3651 for clip in response:
3652 video_url = clip['video_file_url']
3654 video_extension = os.path.splitext(video_url)[1][1:]
# start_time 'YYYY-MM-DD...' -> 'YYYYMMDD'.
3655 video_date = re.sub('-', '', clip['start_time'][:10])
3656 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3657 video_id = clip['id']
3658 video_title = clip.get('title', video_id)
3662 'title': video_title,
3663 'uploader': clip.get('channel_name', video_uploader_id),
3664 'uploader_id': video_uploader_id,
3665 'upload_date': video_date,
3666 'ext': video_extension,
# Returned count is the raw page size, used by the caller to stop paging.
3668 return (len(response), info)
# Choose the channel-archives or single-broadcast endpoint, then page
# through results until a short page signals the end.
# NOTE(review): guards, the paged/offset initializers, the loop header and
# the final return are elided in this dump.
3670 def _real_extract(self, url):
3671 mobj = re.match(self._VALID_URL, url)
3673 self._downloader.report_error(u'invalid URL: %s' % url)
3676 api = 'http://api.justin.tv'
# lastindex == 1 means only the channel group matched (no /b/<id> part).
3677 video_id = mobj.group(mobj.lastindex)
3679 if mobj.lastindex == 1:
3681 api += '/channel/archives/%s.json'
3683 api += '/broadcast/by_archive/%s.json'
3684 api = api % (video_id,)
3686 self.report_extraction(video_id)
3690 limit = self._JUSTIN_PAGE_LIMIT
3693 self.report_download_page(video_id, offset)
3694 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3695 page_count, page_info = self._parse_page(page_url)
3696 info.extend(page_info)
# A partial page (or unpaged request) ends the crawl.
3697 if not paged or page_count != limit:
# Extractor for funnyordie.com video pages (hex id in the URL path).
3702 class FunnyOrDieIE(InfoExtractor):
3703 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
# Scrape the <source> URL, the title (player h1 with <title> fallback) and
# the og:description from the page.
# NOTE(review): "if ... is None:" guards, the desc default and the final
# info-dict opener/return are elided in this dump.
3705 def _real_extract(self, url):
3706 mobj = re.match(self._VALID_URL, url)
3708 self._downloader.report_error(u'invalid URL: %s' % url)
3711 video_id = mobj.group('id')
3712 webpage = self._download_webpage(url, video_id)
3714 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3716 self._downloader.report_error(u'unable to find video information')
3717 video_url = unescapeHTML(m.group('url'))
3719 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
# Fallback: the document <title> when the player heading is absent.
3721 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3723 self._downloader.trouble(u'Cannot find video title')
3724 title = clean_html(m.group('title'))
3726 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3728 desc = unescapeHTML(m.group('desc'))
3737 'description': desc,
3741 class SteamIE(InfoExtractor):
# Verbose-mode pattern (compiled with re.VERBOSE in suitable/_real_extract);
# the 'gameID' group referenced by _real_extract sits in an elided part.
3742 _VALID_URL = r"""http://store.steampowered.com/
3743 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3745 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# Overrides the base suitable() because _VALID_URL needs re.VERBOSE.
# NOTE(review): the @classmethod decorator line is elided in this dump.
3749 def suitable(cls, url):
3750 """Receives a URL and returns True if suitable for this IE."""
3751 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Walk the game's /video/ page, zipping the movie-definition matches with
# the title and thumbnail matches to emit one entry per clip.
# NOTE(review): the videos-list initializer, per-clip dict opener and final
# return are elided in this dump; 'gameID' comes from an elided part of
# _VALID_URL.
3753 def _real_extract(self, url):
3754 m = re.match(self._VALID_URL, url, re.VERBOSE)
3755 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3756 gameID = m.group('gameID')
3757 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3758 webpage = self._download_webpage(videourl, gameID)
3759 mweb = re.finditer(urlRE, webpage)
3760 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3761 titles = re.finditer(namesRE, webpage)
3762 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3763 thumbs = re.finditer(thumbsRE, webpage)
# Assumes the three iterators stay aligned clip-for-clip — TODO confirm.
3765 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3766 video_id = vid.group('videoID')
3767 title = vtitle.group('videoName')
3768 video_url = vid.group('videoURL')
3769 video_thumb = thumb.group('thumbnail')
3771 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3776 'title': unescapeHTML(title),
3777 'thumbnail': video_thumb
# Extractor for recorded Ustream videos; the file URL is derived from the
# numeric id, metadata is scraped from the page.
3782 class UstreamIE(InfoExtractor):
3783 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3784 IE_NAME = u'ustream'
# Build the tcdn URL from the id and scrape title/uploader from the page.
# NOTE(review): the info-dict opener and return are elided in this dump;
# also no None-checks guard the re.search results in the visible code.
3786 def _real_extract(self, url):
3787 m = re.match(self._VALID_URL, url)
3788 video_id = m.group('videoID')
3789 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3790 webpage = self._download_webpage(url, video_id)
3791 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3792 title = m.group('title')
3793 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3794 uploader = m.group('uploader')
3800 'uploader': uploader
# Extractor for worldstarhiphop.com / worldstarcandy.com video pages.
3804 class WorldStarHipHopIE(InfoExtractor):
3805 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3806 IE_NAME = u'WorldStarHipHop'
# Find the hw-videos source URL, then the title (with a candy-page
# fallback) and thumbnail.
# NOTE(review): several branch/assignment lines and the final info-dict
# opener are elided in this dump. The fallback title literally says
# 'World Start Hip Hop' — likely a typo, but it is a runtime string and is
# left untouched here.
3808 def _real_extract(self, url):
3809 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3811 webpage_src = compat_urllib_request.urlopen(url).read()
3812 webpage_src = webpage_src.decode('utf-8')
3814 mobj = re.search(_src_url, webpage_src)
3816 m = re.match(self._VALID_URL, url)
3817 video_id = m.group('id')
3819 if mobj is not None:
3820 video_url = mobj.group()
# Extension branch: the elided lines presumably set ext per mp4/flv.
3821 if 'mp4' in video_url:
3826 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3829 _title = r"""<title>(.*)</title>"""
3831 mobj = re.search(_title, webpage_src)
3833 if mobj is not None:
3834 title = mobj.group(1)
3836 title = 'World Start Hip Hop - %s' % time.ctime()
3838 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3839 mobj = re.search(_thumbnail, webpage_src)
3841 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3842 if mobj is not None:
3843 thumbnail = mobj.group(1)
3845 _title = r"""candytitles.*>(.*)</span>"""
3846 mobj = re.search(_title, webpage_src)
3847 if mobj is not None:
3848 title = mobj.group(1)
3855 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows; metadata comes from a JSON blob
# embedded in an inline <script>.
3860 class RBMARadioIE(InfoExtractor):
3861 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
# Parse the gon.show JSON from the page and assemble the info dict; the
# stream URL is the akamai_url with a fixed 256kbps cbr parameter.
# NOTE(review): "if m is None:"/"try:" lines and the info-dict opener are
# elided in this dump.
3863 def _real_extract(self, url):
3864 m = re.match(self._VALID_URL, url)
3865 video_id = m.group('videoID')
3867 webpage = self._download_webpage(url, video_id)
3868 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3870 raise ExtractorError(u'Cannot find metadata')
3871 json_data = m.group(1)
3874 data = json.loads(json_data)
3875 except ValueError as e:
3876 raise ExtractorError(u'Invalid JSON: ' + str(e))
3878 video_url = data['akamai_url'] + '&cbr=256'
# Extension taken from the URL path, ignoring the query string.
3879 url_parts = compat_urllib_parse_urlparse(video_url)
3880 video_ext = url_parts.path.rpartition('.')[2]
3885 'title': data['title'],
3886 'description': data.get('teaser_text'),
3887 'location': data.get('country_of_origin'),
3888 'uploader': data.get('host', {}).get('name'),
3889 'uploader_id': data.get('host', {}).get('slug'),
3890 'thumbnail': data.get('image', {}).get('large_url_2x'),
3891 'duration': data.get('duration'),
# Extractor for youporn.com watch pages; supports multiple formats plus
# --list-formats via _print_formats.
3896 class YouPornIE(InfoExtractor):
3897 """Information extractor for youporn.com."""
3898 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3900 def _print_formats(self, formats):
3901 """Print all available formats"""
3902 print(u'Available formats:')
3903 print(u'ext\t\tformat')
3904 print(u'---------------------------------')
3905 for format in formats:
3906 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the single format entry whose 'format' label equals *req_format*.
# NOTE(review): the "for x in formats:" loop header and the return lines
# are elided in this dump.
3908 def _specific(self, req_format, formats):
3910 if(x["format"]==req_format):
# Scrape title/date/uploader, collect every download link, derive
# size/bitrate from the link path, then return the subset of formats the
# user asked for.
# NOTE(review): many guard/"else:"/dict-opener/return lines are elided in
# this dump; comments cover the visible code only.
3914 def _real_extract(self, url):
3915 mobj = re.match(self._VALID_URL, url)
3917 self._downloader.report_error(u'invalid URL: %s' % url)
3920 video_id = mobj.group('videoid')
# Age gate is bypassed by presetting the verification cookie.
3922 req = compat_urllib_request.Request(url)
3923 req.add_header('Cookie', 'age_verified=1')
3924 webpage = self._download_webpage(req, video_id)
3926 # Get the video title
3927 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3929 raise ExtractorError(u'Unable to extract video title')
3930 video_title = result.group('title').strip()
3932 # Get the video date
3933 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3935 self._downloader.report_warning(u'unable to extract video date')
3938 upload_date = result.group('date').strip()
3940 # Get the video uploader
3941 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3943 self._downloader.report_warning(u'unable to extract uploader')
3944 video_uploader = None
3946 video_uploader = result.group('uploader').strip()
3947 video_uploader = clean_html( video_uploader )
3949 # Get all of the formats available
3950 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3951 result = re.search(DOWNLOAD_LIST_RE, webpage)
3953 raise ExtractorError(u'Unable to extract download list')
3954 download_list_html = result.group('download_list').strip()
3956 # Get all of the links from the page
3957 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3958 links = re.findall(LINK_RE, download_list_html)
3959 if(len(links) == 0):
3960 raise ExtractorError(u'ERROR: no known formats available for video')
3962 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3967 # A link looks like this:
3968 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3969 # A path looks like this:
3970 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3971 video_url = unescapeHTML( link )
3972 path = compat_urllib_parse_urlparse( video_url ).path
3973 extension = os.path.splitext( path )[1][1:]
# Path segment like '480p_370k_...' yields the size/bitrate pair.
3974 format = path.split('/')[4].split('_')[:2]
3977 format = "-".join( format )
3978 title = u'%s-%s-%s' % (video_title, size, bitrate)
3983 'uploader': video_uploader,
3984 'upload_date': upload_date,
3989 'description': None,
3993 if self._downloader.params.get('listformats', None):
3994 self._print_formats(formats)
3997 req_format = self._downloader.params.get('format', None)
3998 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
4000 if req_format is None or req_format == 'best':
4002 elif req_format == 'worst':
4003 return [formats[-1]]
4004 elif req_format in ('-1', 'all'):
4007 format = self._specific( req_format, formats )
4009 self._downloader.report_error(u'requested format not available')
# Extractor for pornotube.com; id and title both come from the URL itself.
4015 class PornotubeIE(InfoExtractor):
4016 """Information extractor for pornotube.com."""
4017 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
# Scrape the FLV url and upload date from the page; id/title come from the
# URL groups.
# NOTE(review): guard/return lines and part of the info dict are elided in
# this dump. The message at 4043 says 'video title' although the failing
# regex is the upload-date one — runtime string, left untouched.
4019 def _real_extract(self, url):
4020 mobj = re.match(self._VALID_URL, url)
4022 self._downloader.report_error(u'invalid URL: %s' % url)
4025 video_id = mobj.group('videoid')
4026 video_title = mobj.group('title')
4028 # Get webpage content
4029 webpage = self._download_webpage(url, video_id)
4032 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
4033 result = re.search(VIDEO_URL_RE, webpage)
4035 self._downloader.report_error(u'unable to extract video url')
4037 video_url = compat_urllib_parse.unquote(result.group('url'))
4039 #Get the uploaded date
4040 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
4041 result = re.search(VIDEO_UPLOADED_RE, webpage)
4043 self._downloader.report_error(u'unable to extract video title')
4045 upload_date = result.group('date')
4047 info = {'id': video_id,
4050 'upload_date': upload_date,
4051 'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Extract the flv stream behind the youjizz embed page for *url*."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The landing page only carries the title and a link to the embed
        # page, which is where the actual media URL lives.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        # The embed page hands the file URL to the flash player via
        # so.addVariable("file", ...).
        webpage = self._download_webpage(embed_page_url, video_id)
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per song)."""

    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JSON literal
        # assigned to PAGE.mix.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # The play/next endpoints require a client-chosen session token.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        res = []
        # Walk the mix track by track until the API flags the last one.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Both the media URL and the thumbnail can be derived directly
        # from the video id on the CDN; no page scraping needed for them.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        # The page is still fetched for title and uploader metadata.
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so the base-class suitable()
        # (which matches without re.VERBOSE) cannot be reused here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            # A single talk yields a one-element result list.
            return [self._talk_info(url)]
        else:
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)
        info = []
        # Pair each talk <li> entry with its title link, in document order.
        for m_video, m_name in zip(m_videos, m_names):
            video_id = m_video.group('video_id')
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url, video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        videoName = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE = r'<span id="altHeadline" >(?P<title>.*)</span>'
        title = re.search(title_RE, webpage).group('title')
        info_RE = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE = r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail')
        }
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata comes from an XML service)."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # Fetch the per-video XML metadata document.
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # url_flv and title are mandatory; abort with an error if missing.
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text

        # format_id, description and imagePreview are optional fields.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # fall back to the file extension when no explicit format is given
            format = extension
        else:
            format = format_id_el.text

        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None

        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # The media variants live in a per-video XML document on the CDN.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Take the last entry of the document — presumably the highest
        # quality variant; TODO(review) confirm against the XML schema.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # CONSISTENCY FIX: was self._downloader.trouble(u'ERROR: ...');
            # every other IE in this file uses report_error.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # The player config embeds the direct media URL as file: "...".
        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUG FIX: the old trouble() call did not return, so execution
            # fell through and crashed with AttributeError on m.group().
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional fields; missing matches
        # simply yield None rather than an error.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(m.group(1)) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # prefer the numeric documentId query parameter when present
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # BUG FIX: was `assert '"fsk"' in html` before reporting — assert
            # is stripped under python -O and crashes with a bare
            # AssertionError on any unexpected page; report explicitly instead.
            if '"fsk"' in html:
                self._downloader.report_error(u'this video is only available after 8:00 pm')
            else:
                self._downloader.report_error(u'unable to extract media streams')
            return

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            # internal invariants of the page format, kept as asserts
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4423 def gen_extractors():
4424 """ Return a list of an instance of every supported extractor.
4425 The order does matter; the first extractor matched is the one handling the URL.
4428 YoutubePlaylistIE(),
4453 StanfordOpenClassroomIE(),
4463 WorldStarHipHopIE(),