youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18
  19 from .utils import *
  20
  21
  22 class InfoExtractor(object):
  23     """Information Extractor class.
  24
  25     Information extractors are the classes that, given a URL, extract
  26     information about the video (or videos) the URL refers to. This
  27     information includes the real video URL, the video title, author and
  28     others. The information is stored in a dictionary which is then
  29     passed to the FileDownloader. The FileDownloader processes this
  30     information possibly downloading the video to the file system, among
  31     other possible outcomes.
  32
  33     The dictionaries must include the following fields:
  34
  35     id:             Video identifier.
  36     url:            Final video URL.
  37     title:          Video title, unescaped.
  38     ext:            Video filename extension.
  39
  40     The following fields are optional:
  41
  42     format:         The video format, defaults to ext (used for --get-format)
  43     thumbnail:      Full URL to a video thumbnail image.
  44     description:    One-line video description.
  45     uploader:       Full name of the video uploader.
  46     upload_date:    Video upload date (YYYYMMDD).
  47     uploader_id:    Nickname or id of the video uploader.
  48     location:       Physical location of the video.
  49     player_url:     SWF Player URL (used for rtmpdump).
  50     subtitles:      The .srt file contents.
  51     urlhandle:      [internal] The urlHandle to be used to download the file,
  52                     like returned by urllib.request.urlopen
  53
  54     The fields should all be Unicode strings.
  55
  56     Subclasses of this one should re-define the _real_initialize() and
  57     _real_extract() methods and define a _VALID_URL regexp.
  58     Probably, they should also be added to the list of extractors.
  59
  60     _real_extract() must return a *list* of information dictionaries as
  61     described above.
  62
  63     Finally, the _WORKING attribute should be set to False for broken IEs
  64     in order to warn the users and skip the tests.
  65     """
  66
  67     _ready = False
  68     _downloader = None
  69     _WORKING = True
  70
  71     def __init__(self, downloader=None):
  72         """Constructor. Receives an optional downloader."""
  73         self._ready = False
  74         self.set_downloader(downloader)
  75
  76     def suitable(self, url):
  77         """Receives a URL and returns True if suitable for this IE."""
  78         return re.match(self._VALID_URL, url) is not None
  79
  80     def working(self):
  81         """Getter method for _WORKING."""
  82         return self._WORKING
  83
  84     def initialize(self):
  85         """Initializes an instance (authentication, etc)."""
  86         if not self._ready:
  87             self._real_initialize()
  88             self._ready = True
  89
  90     def extract(self, url):
  91         """Extracts URL information and returns it in list of dicts."""
  92         self.initialize()
  93         return self._real_extract(url)
  94
  95     def set_downloader(self, downloader):
  96         """Sets the downloader for this IE."""
  97         self._downloader = downloader
  98
  99     def _real_initialize(self):
 100         """Real initialization process. Redefine in subclasses."""
 101         pass
 102
 103     def _real_extract(self, url):
 104         """Real extraction process. Redefine in subclasses."""
 105         pass
 106
 107     @property
 108     def IE_NAME(self):
 109         return type(self).__name__[:-2]
 110
 111     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 112         """ Returns the response handle """
 113         if note is None:
 114             note = u'Downloading video webpage'
 115         self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
 116         try:
 117             return compat_urllib_request.urlopen(url_or_request)
 118         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 119             if errnote is None:
 120                 errnote = u'Unable to download webpage'
 121             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 122
 123     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 124         """ Returns the data of the page as a string """
 125         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 126         webpage_bytes = urlh.read()
 127         return webpage_bytes.decode('utf-8', 'replace')
 128
 129
 130 class YoutubeIE(InfoExtractor):
 131     """Information extractor for youtube.com."""
 132
 133     _VALID_URL = r"""^
 134                      (
 135                          (?:https?://)?                                       # http(s):// (optional)
 136                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 137                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 138                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 139                          (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 140                          (?:                                                  # the various things that can precede the ID:
 141                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 142                              |(?:                                             # or the v= param in all its forms
 143                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 144                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 145                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 146                                  v=
 147                              )
 148                          )?                                                   # optional -> youtube.com/xxxx is OK
 149                      )?                                                       # all until now is optional -> you can pass the naked ID
 150                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 151                      (?(1).+)?                                                # if we found the ID, everything can follow
 152                      $"""
 153     _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 154     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 155     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 156     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 157     _NETRC_MACHINE = 'youtube'
 158     # Listed in order of quality
 159     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 160     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 161     _video_extensions = {
 162         '13': '3gp',
 163         '17': 'mp4',
 164         '18': 'mp4',
 165         '22': 'mp4',
 166         '37': 'mp4',
 167         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 168         '43': 'webm',
 169         '44': 'webm',
 170         '45': 'webm',
 171         '46': 'webm',
 172     }
 173     _video_dimensions = {
 174         '5': '240x400',
 175         '6': '???',
 176         '13': '???',
 177         '17': '144x176',
 178         '18': '360x640',
 179         '22': '720x1280',
 180         '34': '360x640',
 181         '35': '480x854',
 182         '37': '1080x1920',
 183         '38': '3072x4096',
 184         '43': '360x640',
 185         '44': '480x854',
 186         '45': '720x1280',
 187         '46': '1080x1920',
 188     }
 189     IE_NAME = u'youtube'
 190
 191     def suitable(self, url):
 192         """Receives a URL and returns True if suitable for this IE."""
 193         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 194
 195     def report_lang(self):
 196         """Report attempt to set language."""
 197         self._downloader.to_screen(u'[youtube] Setting language')
 198
 199     def report_login(self):
 200         """Report attempt to log in."""
 201         self._downloader.to_screen(u'[youtube] Logging in')
 202
 203     def report_age_confirmation(self):
 204         """Report attempt to confirm age."""
 205         self._downloader.to_screen(u'[youtube] Confirming age')
 206
 207     def report_video_webpage_download(self, video_id):
 208         """Report attempt to download video webpage."""
 209         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 210
 211     def report_video_info_webpage_download(self, video_id):
 212         """Report attempt to download video info webpage."""
 213         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 214
 215     def report_video_subtitles_download(self, video_id):
 216         """Report attempt to download video info webpage."""
 217         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 218
 219     def report_information_extraction(self, video_id):
 220         """Report attempt to extract video information."""
 221         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 222
 223     def report_unavailable_format(self, video_id, format):
 224         """Report extracted video URL."""
 225         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 226
 227     def report_rtmp_download(self):
 228         """Indicate the download will use the RTMP protocol."""
 229         self._downloader.to_screen(u'[youtube] RTMP download detected')
 230
 231     def _closed_captions_xml_to_srt(self, xml_string):
 232         srt = ''
 233         texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 234         # TODO parse xml instead of regex
 235         for n, (start, dur_tag, dur, caption) in enumerate(texts):
 236             if not dur: dur = '4'
 237             start = float(start)
 238             end = start + float(dur)
 239             start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 240             end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 241             caption = unescapeHTML(caption)
 242             caption = unescapeHTML(caption) # double cycle, intentional
 243             srt += str(n+1) + '\n'
 244             srt += start + ' --> ' + end + '\n'
 245             srt += caption + '\n\n'
 246         return srt
 247
 248     def _extract_subtitles(self, video_id):
 249         self.report_video_subtitles_download(video_id)
 250         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 251         try:
 252             srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 253         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 254             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 255         srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 256         srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 257         if not srt_lang_list:
 258             return (u'WARNING: video has no closed captions', None)
 259         if self._downloader.params.get('subtitleslang', False):
 260             srt_lang = self._downloader.params.get('subtitleslang')
 261         elif 'en' in srt_lang_list:
 262             srt_lang = 'en'
 263         else:
 264             srt_lang = list(srt_lang_list.keys())[0]
 265         if not srt_lang in srt_lang_list:
 266             return (u'WARNING: no closed captions found in the specified language', None)
 267         params = compat_urllib_parse.urlencode({
 268             'lang': srt_lang,
 269             'name': srt_lang_list[srt_lang].encode('utf-8'),
 270             'v': video_id,
 271         })
 272         url = 'http://www.youtube.com/api/timedtext?' + params
 273         try:
 274             srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
 275         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 276             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 277         if not srt_xml:
 278             return (u'WARNING: Did not fetch video subtitles', None)
 279         return (None, self._closed_captions_xml_to_srt(srt_xml))
 280
 281     def _print_formats(self, formats):
 282         print('Available formats:')
 283         for x in formats:
 284             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 285
 286     def _real_initialize(self):
 287         if self._downloader is None:
 288             return
 289
 290         username = None
 291         password = None
 292         downloader_params = self._downloader.params
 293
 294         # Attempt to use provided username and password or .netrc data
 295         if downloader_params.get('username', None) is not None:
 296             username = downloader_params['username']
 297             password = downloader_params['password']
 298         elif downloader_params.get('usenetrc', False):
 299             try:
 300                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 301                 if info is not None:
 302                     username = info[0]
 303                     password = info[2]
 304                 else:
 305                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 306             except (IOError, netrc.NetrcParseError) as err:
 307                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
 308                 return
 309
 310         # Set language
 311         request = compat_urllib_request.Request(self._LANG_URL)
 312         try:
 313             self.report_lang()
 314             compat_urllib_request.urlopen(request).read()
 315         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 316             self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
 317             return
 318
 319         # No authentication to be performed
 320         if username is None:
 321             return
 322
 323         request = compat_urllib_request.Request(self._LOGIN_URL)
 324         try:
 325             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 326         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 327             self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
 328             return
 329
 330         galx = None
 331         dsh = None
 332         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 333         if match:
 334           galx = match.group(1)
 335
 336         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 337         if match:
 338           dsh = match.group(1)
 339
 340         # Log in
 341         login_form_strs = {
 342                 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 343                 u'Email': username,
 344                 u'GALX': galx,
 345                 u'Passwd': password,
 346                 u'PersistentCookie': u'yes',
 347                 u'_utf8': u'霱',
 348                 u'bgresponse': u'js_disabled',
 349                 u'checkConnection': u'',
 350                 u'checkedDomains': u'youtube',
 351                 u'dnConn': u'',
 352                 u'dsh': dsh,
 353                 u'pstMsg': u'0',
 354                 u'rmShown': u'1',
 355                 u'secTok': u'',
 356                 u'signIn': u'Sign in',
 357                 u'timeStmp': u'',
 358                 u'service': u'youtube',
 359                 u'uilel': u'3',
 360                 u'hl': u'en_US',
 361         }
 362         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 363         # chokes on unicode
 364         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 365         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 366         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 367         try:
 368             self.report_login()
 369             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 370             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 371                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 372                 return
 373         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 374             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
 375             return
 376
 377         # Confirm age
 378         age_form = {
 379                 'next_url':     '/',
 380                 'action_confirm':   'Confirm',
 381                 }
 382         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 383         try:
 384             self.report_age_confirmation()
 385             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 386         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 387             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 388             return
 389
 390     def _extract_id(self, url):
 391         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 392         if mobj is None:
 393             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 394             return
 395         video_id = mobj.group(2)
 396         return video_id
 397
 398     def _real_extract(self, url):
 399         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 400         mobj = re.search(self._NEXT_URL_RE, url)
 401         if mobj:
 402             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 403         video_id = self._extract_id(url)
 404
 405         # Get video webpage
 406         self.report_video_webpage_download(video_id)
 407         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 408         request = compat_urllib_request.Request(url)
 409         try:
 410             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 411         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 412             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
 413             return
 414
 415         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 416
 417         # Attempt to extract SWF player URL
 418         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 419         if mobj is not None:
 420             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 421         else:
 422             player_url = None
 423
 424         # Get video info
 425         self.report_video_info_webpage_download(video_id)
 426         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 427             video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 428                     % (video_id, el_type))
 429             request = compat_urllib_request.Request(video_info_url)
 430             try:
 431                 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
 432                 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
 433                 video_info = compat_parse_qs(video_info_webpage)
 434                 if 'token' in video_info:
 435                     break
 436             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 437                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
 438                 return
 439         if 'token' not in video_info:
 440             if 'reason' in video_info:
 441                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
 442             else:
 443                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 444             return
 445
 446         # Check for "rental" videos
 447         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 448             self._downloader.trouble(u'ERROR: "rental" videos not supported')
 449             return
 450
 451         # Start extracting information
 452         self.report_information_extraction(video_id)
 453
 454         # uploader
 455         if 'author' not in video_info:
 456             self._downloader.trouble(u'ERROR: unable to extract uploader name')
 457             return
 458         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 459
 460         # uploader_id
 461         video_uploader_id = None
 462         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 463         if mobj is not None:
 464             video_uploader_id = mobj.group(1)
 465         else:
 466             self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 467
 468         # title
 469         if 'title' not in video_info:
 470             self._downloader.trouble(u'ERROR: unable to extract video title')
 471             return
 472         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 473
 474         # thumbnail image
 475         if 'thumbnail_url' not in video_info:
 476             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 477             video_thumbnail = ''
 478         else:   # don't panic if we can't find it
 479             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 480
 481         # upload date
 482         upload_date = None
 483         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 484         if mobj is not None:
 485             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 486             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 487             for expression in format_expressions:
 488                 try:
 489                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 490                 except:
 491                     pass
 492
 493         # description
 494         video_description = get_element_by_id("eow-description", video_webpage)
 495         if video_description:
 496             video_description = clean_html(video_description)
 497         else:
 498             video_description = ''
 499
 500         # closed captions
 501         video_subtitles = None
 502         if self._downloader.params.get('writesubtitles', False):
 503             (srt_error, video_subtitles) = self._extract_subtitles(video_id)
 504             if srt_error:
 505                 self._downloader.trouble(srt_error)
 506
 507         if 'length_seconds' not in video_info:
 508             self._downloader.trouble(u'WARNING: unable to extract video duration')
 509             video_duration = ''
 510         else:
 511             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 512
 513         # token
 514         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 515
 516         # Decide which formats to download
 517         req_format = self._downloader.params.get('format', None)
 518
 519         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 520             self.report_rtmp_download()
 521             video_url_list = [(None, video_info['conn'][0])]
 522         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 523             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 524             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 525             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 526             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 527
 528             format_limit = self._downloader.params.get('format_limit', None)
 529             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 530             if format_limit is not None and format_limit in available_formats:
 531                 format_list = available_formats[available_formats.index(format_limit):]
 532             else:
 533                 format_list = available_formats
 534             existing_formats = [x for x in format_list if x in url_map]
 535             if len(existing_formats) == 0:
 536                 self._downloader.trouble(u'ERROR: no known formats available for video')
 537                 return
 538             if self._downloader.params.get('listformats', None):
 539                 self._print_formats(existing_formats)
 540                 return
 541             if req_format is None or req_format == 'best':
 542                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 543             elif req_format == 'worst':
 544                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 545             elif req_format in ('-1', 'all'):
 546                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 547             else:
 548                 # Specific formats. We pick the first in a slash-delimeted sequence.
 549                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 550                 req_formats = req_format.split('/')
 551                 video_url_list = None
 552                 for rf in req_formats:
 553                     if rf in url_map:
 554                         video_url_list = [(rf, url_map[rf])]
 555                         break
 556                 if video_url_list is None:
 557                     self._downloader.trouble(u'ERROR: requested format not available')
 558                     return
 559         else:
 560             self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 561             return
 562
 563         results = []
 564         for format_param, video_real_url in video_url_list:
 565             # Extension
 566             video_extension = self._video_extensions.get(format_param, 'flv')
 567
 568             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 569                                               self._video_dimensions.get(format_param, '???'))
 570
 571             results.append({
 572                 'id':       video_id,
 573                 'url':      video_real_url,
 574                 'uploader': video_uploader,
 575                 'uploader_id': video_uploader_id,
 576                 'upload_date':  upload_date,
 577                 'title':    video_title,
 578                 'ext':      video_extension,
 579                 'format':   video_format,
 580                 'thumbnail':    video_thumbnail,
 581                 'description':  video_description,
 582                 'player_url':   player_url,
 583                 'subtitles':    video_subtitles,
 584                 'duration':     video_duration
 585             })
 586         return results
 587
 588
 589 class MetacafeIE(InfoExtractor):
 590     """Information Extractor for metacafe.com."""
 591
 592     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 593     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 594     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 595     IE_NAME = u'metacafe'
 596
 597     def __init__(self, downloader=None):
 598         InfoExtractor.__init__(self, downloader)
 599
 600     def report_disclaimer(self):
 601         """Report disclaimer retrieval."""
 602         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 603
 604     def report_age_confirmation(self):
 605         """Report attempt to confirm age."""
 606         self._downloader.to_screen(u'[metacafe] Confirming age')
 607
 608     def report_download_webpage(self, video_id):
 609         """Report webpage download."""
 610         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 611
 612     def report_extraction(self, video_id):
 613         """Report information extraction."""
 614         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 615
 616     def _real_initialize(self):
 617         # Retrieve disclaimer
 618         request = compat_urllib_request.Request(self._DISCLAIMER)
 619         try:
 620             self.report_disclaimer()
 621             disclaimer = compat_urllib_request.urlopen(request).read()
 622         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 623             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
 624             return
 625
 626         # Confirm age
 627         disclaimer_form = {
 628             'filters': '0',
 629             'submit': "Continue - I'm over 18",
 630             }
 631         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 632         try:
 633             self.report_age_confirmation()
 634             disclaimer = compat_urllib_request.urlopen(request).read()
 635         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 636             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 637             return
 638
 639     def _real_extract(self, url):
 640         # Extract id and simplified title from URL
 641         mobj = re.match(self._VALID_URL, url)
 642         if mobj is None:
 643             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 644             return
 645
 646         video_id = mobj.group(1)
 647
 648         # Check if video comes from YouTube
 649         mobj2 = re.match(r'^yt-(.*)$', video_id)
 650         if mobj2 is not None:
 651             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 652             return
 653
 654         # Retrieve video webpage to extract further information
 655         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 656         try:
 657             self.report_download_webpage(video_id)
 658             webpage = compat_urllib_request.urlopen(request).read()
 659         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 660             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 661             return
 662
 663         # Extract URL, uploader and title from webpage
 664         self.report_extraction(video_id)
 665         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 666         if mobj is not None:
 667             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 668             video_extension = mediaURL[-3:]
 669
 670             # Extract gdaKey if available
 671             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 672             if mobj is None:
 673                 video_url = mediaURL
 674             else:
 675                 gdaKey = mobj.group(1)
 676                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 677         else:
 678             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 679             if mobj is None:
 680                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 681                 return
 682             vardict = compat_parse_qs(mobj.group(1))
 683             if 'mediaData' not in vardict:
 684                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 685                 return
 686             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 687             if mobj is None:
 688                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 689                 return
 690             mediaURL = mobj.group(1).replace('\\/', '/')
 691             video_extension = mediaURL[-3:]
 692             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 693
 694         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 695         if mobj is None:
 696             self._downloader.trouble(u'ERROR: unable to extract title')
 697             return
 698         video_title = mobj.group(1).decode('utf-8')
 699
 700         mobj = re.search(r'submitter=(.*?);', webpage)
 701         if mobj is None:
 702             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 703             return
 704         video_uploader = mobj.group(1)
 705
 706         return [{
 707             'id':       video_id.decode('utf-8'),
 708             'url':      video_url.decode('utf-8'),
 709             'uploader': video_uploader.decode('utf-8'),
 710             'upload_date':  None,
 711             'title':    video_title,
 712             'ext':      video_extension.decode('utf-8'),
 713         }]
 714
 715
 716 class DailymotionIE(InfoExtractor):
 717     """Information Extractor for Dailymotion"""
 718
 719     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 720     IE_NAME = u'dailymotion'
 721     _WORKING = False
 722
 723     def __init__(self, downloader=None):
 724         InfoExtractor.__init__(self, downloader)
 725
 726     def report_extraction(self, video_id):
 727         """Report information extraction."""
 728         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 729
 730     def _real_extract(self, url):
 731         # Extract id and simplified title from URL
 732         mobj = re.match(self._VALID_URL, url)
 733         if mobj is None:
 734             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 735             return
 736
 737         video_id = mobj.group(1).split('_')[0].split('?')[0]
 738
 739         video_extension = 'mp4'
 740
 741         # Retrieve video webpage to extract further information
 742         request = compat_urllib_request.Request(url)
 743         request.add_header('Cookie', 'family_filter=off')
 744         webpage = self._download_webpage(request, video_id)
 745
 746         # Extract URL, uploader and title from webpage
 747         self.report_extraction(video_id)
 748         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 749         if mobj is None:
 750             self._downloader.trouble(u'ERROR: unable to extract media URL')
 751             return
 752         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 753
 754         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 755             if key in flashvars:
 756                 max_quality = key
 757                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 758                 break
 759         else:
 760             self._downloader.trouble(u'ERROR: unable to extract video URL')
 761             return
 762
 763         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 764         if mobj is None:
 765             self._downloader.trouble(u'ERROR: unable to extract video URL')
 766             return
 767
 768         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 769
 770         # TODO: support choosing qualities
 771
 772         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 773         if mobj is None:
 774             self._downloader.trouble(u'ERROR: unable to extract title')
 775             return
 776         video_title = unescapeHTML(mobj.group('title'))
 777
 778         video_uploader = None
 779         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 780         if mobj is None:
 781             # lookin for official user
 782             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 783             if mobj_official is None:
 784                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 785             else:
 786                 video_uploader = mobj_official.group(1)
 787         else:
 788             video_uploader = mobj.group(1)
 789
 790         video_upload_date = None
 791         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 792         if mobj is not None:
 793             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 794
 795         return [{
 796             'id':       video_id,
 797             'url':      video_url,
 798             'uploader': video_uploader,
 799             'upload_date':  video_upload_date,
 800             'title':    video_title,
 801             'ext':      video_extension,
 802         }]
 803
 804
 805 class PhotobucketIE(InfoExtractor):
 806     """Information extractor for photobucket.com."""
 807
 808     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 809     IE_NAME = u'photobucket'
 810
 811     def __init__(self, downloader=None):
 812         InfoExtractor.__init__(self, downloader)
 813
 814     def report_download_webpage(self, video_id):
 815         """Report webpage download."""
 816         self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 817
 818     def report_extraction(self, video_id):
 819         """Report information extraction."""
 820         self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 821
 822     def _real_extract(self, url):
 823         # Extract id from URL
 824         mobj = re.match(self._VALID_URL, url)
 825         if mobj is None:
 826             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 827             return
 828
 829         video_id = mobj.group(1)
 830
 831         video_extension = 'flv'
 832
 833         # Retrieve video webpage to extract further information
 834         request = compat_urllib_request.Request(url)
 835         try:
 836             self.report_download_webpage(video_id)
 837             webpage = compat_urllib_request.urlopen(request).read()
 838         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 839             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 840             return
 841
 842         # Extract URL, uploader, and title from webpage
 843         self.report_extraction(video_id)
 844         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 845         if mobj is None:
 846             self._downloader.trouble(u'ERROR: unable to extract media URL')
 847             return
 848         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 849
 850         video_url = mediaURL
 851
 852         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 853         if mobj is None:
 854             self._downloader.trouble(u'ERROR: unable to extract title')
 855             return
 856         video_title = mobj.group(1).decode('utf-8')
 857
 858         video_uploader = mobj.group(2).decode('utf-8')
 859
 860         return [{
 861             'id':       video_id.decode('utf-8'),
 862             'url':      video_url.decode('utf-8'),
 863             'uploader': video_uploader,
 864             'upload_date':  None,
 865             'title':    video_title,
 866             'ext':      video_extension.decode('utf-8'),
 867         }]
 868
 869
 870 class YahooIE(InfoExtractor):
 871     """Information extractor for video.yahoo.com."""
 872
 873     _WORKING = False
 874     # _VALID_URL matches all Yahoo! Video URLs
 875     # _VPAGE_URL matches only the extractable '/watch/' URLs
 876     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 877     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 878     IE_NAME = u'video.yahoo'
 879
 880     def __init__(self, downloader=None):
 881         InfoExtractor.__init__(self, downloader)
 882
 883     def report_download_webpage(self, video_id):
 884         """Report webpage download."""
 885         self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 886
 887     def report_extraction(self, video_id):
 888         """Report information extraction."""
 889         self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 890
 891     def _real_extract(self, url, new_video=True):
 892         # Extract ID from URL
 893         mobj = re.match(self._VALID_URL, url)
 894         if mobj is None:
 895             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 896             return
 897
 898         video_id = mobj.group(2)
 899         video_extension = 'flv'
 900
 901         # Rewrite valid but non-extractable URLs as
 902         # extractable English language /watch/ URLs
 903         if re.match(self._VPAGE_URL, url) is None:
 904             request = compat_urllib_request.Request(url)
 905             try:
 906                 webpage = compat_urllib_request.urlopen(request).read()
 907             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 908                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 909                 return
 910
 911             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 912             if mobj is None:
 913                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 914                 return
 915             yahoo_id = mobj.group(1)
 916
 917             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 918             if mobj is None:
 919                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 920                 return
 921             yahoo_vid = mobj.group(1)
 922
 923             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 924             return self._real_extract(url, new_video=False)
 925
 926         # Retrieve video webpage to extract further information
 927         request = compat_urllib_request.Request(url)
 928         try:
 929             self.report_download_webpage(video_id)
 930             webpage = compat_urllib_request.urlopen(request).read()
 931         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 932             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 933             return
 934
 935         # Extract uploader and title from webpage
 936         self.report_extraction(video_id)
 937         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 938         if mobj is None:
 939             self._downloader.trouble(u'ERROR: unable to extract video title')
 940             return
 941         video_title = mobj.group(1).decode('utf-8')
 942
 943         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 944         if mobj is None:
 945             self._downloader.trouble(u'ERROR: unable to extract video uploader')
 946             return
 947         video_uploader = mobj.group(1).decode('utf-8')
 948
 949         # Extract video thumbnail
 950         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 951         if mobj is None:
 952             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 953             return
 954         video_thumbnail = mobj.group(1).decode('utf-8')
 955
 956         # Extract video description
 957         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 958         if mobj is None:
 959             self._downloader.trouble(u'ERROR: unable to extract video description')
 960             return
 961         video_description = mobj.group(1).decode('utf-8')
 962         if not video_description:
 963             video_description = 'No description available.'
 964
 965         # Extract video height and width
 966         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 967         if mobj is None:
 968             self._downloader.trouble(u'ERROR: unable to extract video height')
 969             return
 970         yv_video_height = mobj.group(1)
 971
 972         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 973         if mobj is None:
 974             self._downloader.trouble(u'ERROR: unable to extract video width')
 975             return
 976         yv_video_width = mobj.group(1)
 977
 978         # Retrieve video playlist to extract media URL
 979         # I'm not completely sure what all these options are, but we
 980         # seem to need most of them, otherwise the server sends a 401.
 981         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 982         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 983         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 984                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 985                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 986         try:
 987             self.report_download_webpage(video_id)
 988             webpage = compat_urllib_request.urlopen(request).read()
 989         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 990             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 991             return
 992
 993         # Extract media URL from playlist XML
 994         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
 995         if mobj is None:
 996             self._downloader.trouble(u'ERROR: Unable to extract media URL')
 997             return
 998         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
 999         video_url = unescapeHTML(video_url)
1000
1001         return [{
1002             'id':       video_id.decode('utf-8'),
1003             'url':      video_url,
1004             'uploader': video_uploader,
1005             'upload_date':  None,
1006             'title':    video_title,
1007             'ext':      video_extension.decode('utf-8'),
1008             'thumbnail':    video_thumbnail.decode('utf-8'),
1009             'description':  video_description,
1010         }]
1011
1012
1013 class VimeoIE(InfoExtractor):
1014     """Information extractor for vimeo.com."""
1015
1016     # _VALID_URL matches Vimeo URLs
1017     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1018     IE_NAME = u'vimeo'
1019
1020     def __init__(self, downloader=None):
1021         InfoExtractor.__init__(self, downloader)
1022
1023     def report_download_webpage(self, video_id):
1024         """Report webpage download."""
1025         self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1026
1027     def report_extraction(self, video_id):
1028         """Report information extraction."""
1029         self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1030
1031     def _real_extract(self, url, new_video=True):
1032         # Extract ID from URL
1033         mobj = re.match(self._VALID_URL, url)
1034         if mobj is None:
1035             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1036             return
1037
1038         video_id = mobj.group('id')
1039         if not mobj.group('proto'):
1040             url = 'https://' + url
1041         if mobj.group('direct_link'):
1042             url = 'https://vimeo.com/' + video_id
1043
1044         # Retrieve video webpage to extract further information
1045         request = compat_urllib_request.Request(url, None, std_headers)
1046         try:
1047             self.report_download_webpage(video_id)
1048             webpage_bytes = compat_urllib_request.urlopen(request).read()
1049             webpage = webpage_bytes.decode('utf-8')
1050         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1051             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1052             return
1053
1054         # Now we begin extracting as much information as we can from what we
1055         # retrieved. First we extract the information common to all extractors,
1056         # and latter we extract those that are Vimeo specific.
1057         self.report_extraction(video_id)
1058
1059         # Extract the config JSON
1060         try:
1061             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1062             config = json.loads(config)
1063         except:
1064             self._downloader.trouble(u'ERROR: unable to extract info section')
1065             return
1066
1067         # Extract title
1068         video_title = config["video"]["title"]
1069
1070         # Extract uploader and uploader_id
1071         video_uploader = config["video"]["owner"]["name"]
1072         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1073
1074         # Extract video thumbnail
1075         video_thumbnail = config["video"]["thumbnail"]
1076
1077         # Extract video description
1078         video_description = get_element_by_attribute("itemprop", "description", webpage)
1079         if video_description: video_description = clean_html(video_description)
1080         else: video_description = ''
1081
1082         # Extract upload date
1083         video_upload_date = None
1084         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1085         if mobj is not None:
1086             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1087
1088         # Vimeo specific: extract request signature and timestamp
1089         sig = config['request']['signature']
1090         timestamp = config['request']['timestamp']
1091
1092         # Vimeo specific: extract video codec and quality information
1093         # First consider quality, then codecs, then take everything
1094         # TODO bind to format param
1095         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1096         files = { 'hd': [], 'sd': [], 'other': []}
1097         for codec_name, codec_extension in codecs:
1098             if codec_name in config["video"]["files"]:
1099                 if 'hd' in config["video"]["files"][codec_name]:
1100                     files['hd'].append((codec_name, codec_extension, 'hd'))
1101                 elif 'sd' in config["video"]["files"][codec_name]:
1102                     files['sd'].append((codec_name, codec_extension, 'sd'))
1103                 else:
1104                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1105
1106         for quality in ('hd', 'sd', 'other'):
1107             if len(files[quality]) > 0:
1108                 video_quality = files[quality][0][2]
1109                 video_codec = files[quality][0][0]
1110                 video_extension = files[quality][0][1]
1111                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1112                 break
1113         else:
1114             self._downloader.trouble(u'ERROR: no known codec found')
1115             return
1116
1117         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1118                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1119
1120         return [{
1121             'id':       video_id,
1122             'url':      video_url,
1123             'uploader': video_uploader,
1124             'uploader_id': video_uploader_id,
1125             'upload_date':  video_upload_date,
1126             'title':    video_title,
1127             'ext':      video_extension,
1128             'thumbnail':    video_thumbnail,
1129             'description':  video_description,
1130         }]
1131
1132
1133 class ArteTvIE(InfoExtractor):
1134     """arte.tv information extractor."""
1135
1136     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1137     _LIVE_URL = r'index-[0-9]+\.html$'
1138
1139     IE_NAME = u'arte.tv'
1140
1141     def __init__(self, downloader=None):
1142         InfoExtractor.__init__(self, downloader)
1143
1144     def report_download_webpage(self, video_id):
1145         """Report webpage download."""
1146         self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1147
1148     def report_extraction(self, video_id):
1149         """Report information extraction."""
1150         self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1151
1152     def fetch_webpage(self, url):
1153         request = compat_urllib_request.Request(url)
1154         try:
1155             self.report_download_webpage(url)
1156             webpage = compat_urllib_request.urlopen(request).read()
1157         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1158             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1159             return
1160         except ValueError as err:
1161             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1162             return
1163         return webpage
1164
1165     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1166         page = self.fetch_webpage(url)
1167         mobj = re.search(regex, page, regexFlags)
1168         info = {}
1169
1170         if mobj is None:
1171             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1172             return
1173
1174         for (i, key, err) in matchTuples:
1175             if mobj.group(i) is None:
1176                 self._downloader.trouble(err)
1177                 return
1178             else:
1179                 info[key] = mobj.group(i)
1180
1181         return info
1182
1183     def extractLiveStream(self, url):
1184         video_lang = url.split('/')[-4]
1185         info = self.grep_webpage(
1186             url,
1187             r'src="(.*?/videothek_js.*?\.js)',
1188             0,
1189             [
1190                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1191             ]
1192         )
1193         http_host = url.split('/')[2]
1194         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1195         info = self.grep_webpage(
1196             next_url,
1197             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1198                 '(http://.*?\.swf).*?' +
1199                 '(rtmp://.*?)\'',
1200             re.DOTALL,
1201             [
1202                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1203                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1204                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1205             ]
1206         )
1207         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1208
1209     def extractPlus7Stream(self, url):
1210         video_lang = url.split('/')[-3]
1211         info = self.grep_webpage(
1212             url,
1213             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1214             0,
1215             [
1216                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1217             ]
1218         )
1219         next_url = compat_urllib_parse.unquote(info.get('url'))
1220         info = self.grep_webpage(
1221             next_url,
1222             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1223             0,
1224             [
1225                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1226             ]
1227         )
1228         next_url = compat_urllib_parse.unquote(info.get('url'))
1229
1230         info = self.grep_webpage(
1231             next_url,
1232             r'<video id="(.*?)".*?>.*?' +
1233                 '<name>(.*?)</name>.*?' +
1234                 '<dateVideo>(.*?)</dateVideo>.*?' +
1235                 '<url quality="hd">(.*?)</url>',
1236             re.DOTALL,
1237             [
1238                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1239                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1240                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1241                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1242             ]
1243         )
1244
1245         return {
1246             'id':           info.get('id'),
1247             'url':          compat_urllib_parse.unquote(info.get('url')),
1248             'uploader':     u'arte.tv',
1249             'upload_date':  info.get('date'),
1250             'title':        info.get('title').decode('utf-8'),
1251             'ext':          u'mp4',
1252             'format':       u'NA',
1253             'player_url':   None,
1254         }
1255
1256     def _real_extract(self, url):
1257         video_id = url.split('/')[-1]
1258         self.report_extraction(video_id)
1259
1260         if re.search(self._LIVE_URL, video_id) is not None:
1261             self.extractLiveStream(url)
1262             return
1263         else:
1264             info = self.extractPlus7Stream(url)
1265
1266         return [info]
1267
1268
1269 class GenericIE(InfoExtractor):
1270     """Generic last-resort information extractor."""
1271
1272     _VALID_URL = r'.*'
1273     IE_NAME = u'generic'
1274
1275     def __init__(self, downloader=None):
1276         InfoExtractor.__init__(self, downloader)
1277
1278     def report_download_webpage(self, video_id):
1279         """Report webpage download."""
1280         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1281         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1282
1283     def report_extraction(self, video_id):
1284         """Report information extraction."""
1285         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1286
1287     def report_following_redirect(self, new_url):
1288         """Report information extraction."""
1289         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1290
1291     def _test_redirect(self, url):
1292         """Check if it is a redirect, like url shorteners, in case restart chain."""
1293         class HeadRequest(compat_urllib_request.Request):
1294             def get_method(self):
1295                 return "HEAD"
1296
1297         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1298             """
1299             Subclass the HTTPRedirectHandler to make it use our
1300             HeadRequest also on the redirected URL
1301             """
1302             def redirect_request(self, req, fp, code, msg, headers, newurl):
1303                 if code in (301, 302, 303, 307):
1304                     newurl = newurl.replace(' ', '%20')
1305                     newheaders = dict((k,v) for k,v in req.headers.items()
1306                                       if k.lower() not in ("content-length", "content-type"))
1307                     return HeadRequest(newurl,
1308                                        headers=newheaders,
1309                                        origin_req_host=req.get_origin_req_host(),
1310                                        unverifiable=True)
1311                 else:
1312                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1313
1314         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1315             """
1316             Fallback to GET if HEAD is not allowed (405 HTTP error)
1317             """
1318             def http_error_405(self, req, fp, code, msg, headers):
1319                 fp.read()
1320                 fp.close()
1321
1322                 newheaders = dict((k,v) for k,v in req.headers.items()
1323                                   if k.lower() not in ("content-length", "content-type"))
1324                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1325                                                  headers=newheaders,
1326                                                  origin_req_host=req.get_origin_req_host(),
1327                                                  unverifiable=True))
1328
1329         # Build our opener
1330         opener = compat_urllib_request.OpenerDirector()
1331         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1332                         HTTPMethodFallback, HEADRedirectHandler,
1333                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1334             opener.add_handler(handler())
1335
1336         response = opener.open(HeadRequest(url))
1337         new_url = response.geturl()
1338
1339         if url == new_url:
1340             return False
1341
1342         self.report_following_redirect(new_url)
1343         self._downloader.download([new_url])
1344         return True
1345
1346     def _real_extract(self, url):
1347         if self._test_redirect(url): return
1348
1349         video_id = url.split('/')[-1]
1350         request = compat_urllib_request.Request(url)
1351         try:
1352             self.report_download_webpage(video_id)
1353             webpage = compat_urllib_request.urlopen(request).read()
1354         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1355             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1356             return
1357         except ValueError as err:
1358             # since this is the last-resort InfoExtractor, if
1359             # this error is thrown, it'll be thrown here
1360             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1361             return
1362
1363         self.report_extraction(video_id)
1364         # Start with something easy: JW Player in SWFObject
1365         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1366         if mobj is None:
1367             # Broaden the search a little bit
1368             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1369         if mobj is None:
1370             # Broaden the search a little bit: JWPlayer JS loader
1371             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1372         if mobj is None:
1373             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1374             return
1375
1376         # It's possible that one of the regexes
1377         # matched, but returned an empty group:
1378         if mobj.group(1) is None:
1379             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1380             return
1381
1382         video_url = compat_urllib_parse.unquote(mobj.group(1))
1383         video_id = os.path.basename(video_url)
1384
1385         # here's a fun little line of code for you:
1386         video_extension = os.path.splitext(video_id)[1][1:]
1387         video_id = os.path.splitext(video_id)[0]
1388
1389         # it's tempting to parse this further, but you would
1390         # have to take into account all the variations like
1391         #   Video Title - Site Name
1392         #   Site Name | Video Title
1393         #   Video Title - Tagline | Site Name
1394         # and so on and so forth; it's just not practical
1395         mobj = re.search(r'<title>(.*)</title>', webpage)
1396         if mobj is None:
1397             self._downloader.trouble(u'ERROR: unable to extract title')
1398             return
1399         video_title = mobj.group(1)
1400
1401         # video uploader is domain name
1402         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1403         if mobj is None:
1404             self._downloader.trouble(u'ERROR: unable to extract title')
1405             return
1406         video_uploader = mobj.group(1)
1407
1408         return [{
1409             'id':       video_id,
1410             'url':      video_url,
1411             'uploader': video_uploader,
1412             'upload_date':  None,
1413             'title':    video_title,
1414             'ext':      video_extension,
1415         }]
1416
1417
1418 class YoutubeSearchIE(InfoExtractor):
1419     """Information Extractor for YouTube search queries."""
1420     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1421     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1422     _max_youtube_results = 1000
1423     IE_NAME = u'youtube:search'
1424
1425     def __init__(self, downloader=None):
1426         InfoExtractor.__init__(self, downloader)
1427
1428     def report_download_page(self, query, pagenum):
1429         """Report attempt to download search page with given number."""
1430         query = query.decode(preferredencoding())
1431         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1432
1433     def _real_extract(self, query):
1434         mobj = re.match(self._VALID_URL, query)
1435         if mobj is None:
1436             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1437             return
1438
1439         prefix, query = query.split(':')
1440         prefix = prefix[8:]
1441         query = query.encode('utf-8')
1442         if prefix == '':
1443             self._download_n_results(query, 1)
1444             return
1445         elif prefix == 'all':
1446             self._download_n_results(query, self._max_youtube_results)
1447             return
1448         else:
1449             try:
1450                 n = int(prefix)
1451                 if n <= 0:
1452                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1453                     return
1454                 elif n > self._max_youtube_results:
1455                     self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1456                     n = self._max_youtube_results
1457                 self._download_n_results(query, n)
1458                 return
1459             except ValueError: # parsing prefix as integer fails
1460                 self._download_n_results(query, 1)
1461                 return
1462
1463     def _download_n_results(self, query, n):
1464         """Downloads a specified number of results for a query"""
1465
1466         video_ids = []
1467         pagenum = 0
1468         limit = n
1469
1470         while (50 * pagenum) < limit:
1471             self.report_download_page(query, pagenum+1)
1472             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1473             request = compat_urllib_request.Request(result_url)
1474             try:
1475                 data = compat_urllib_request.urlopen(request).read()
1476             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1477                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1478                 return
1479             api_response = json.loads(data)['data']
1480
1481             new_ids = list(video['id'] for video in api_response['items'])
1482             video_ids += new_ids
1483
1484             limit = min(n, api_response['totalItems'])
1485             pagenum += 1
1486
1487         if len(video_ids) > n:
1488             video_ids = video_ids[:n]
1489         for id in video_ids:
1490             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1491         return
1492
1493
1494 class GoogleSearchIE(InfoExtractor):
1495     """Information Extractor for Google Video search queries."""
1496     _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1497     _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1498     _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1499     _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1500     _max_google_results = 1000
1501     IE_NAME = u'video.google:search'
1502
1503     def __init__(self, downloader=None):
1504         InfoExtractor.__init__(self, downloader)
1505
1506     def report_download_page(self, query, pagenum):
1507         """Report attempt to download playlist page with given number."""
1508         query = query.decode(preferredencoding())
1509         self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1510
1511     def _real_extract(self, query):
1512         mobj = re.match(self._VALID_URL, query)
1513         if mobj is None:
1514             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1515             return
1516
1517         prefix, query = query.split(':')
1518         prefix = prefix[8:]
1519         query = query.encode('utf-8')
1520         if prefix == '':
1521             self._download_n_results(query, 1)
1522             return
1523         elif prefix == 'all':
1524             self._download_n_results(query, self._max_google_results)
1525             return
1526         else:
1527             try:
1528                 n = int(prefix)
1529                 if n <= 0:
1530                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1531                     return
1532                 elif n > self._max_google_results:
1533                     self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1534                     n = self._max_google_results
1535                 self._download_n_results(query, n)
1536                 return
1537             except ValueError: # parsing prefix as integer fails
1538                 self._download_n_results(query, 1)
1539                 return
1540
1541     def _download_n_results(self, query, n):
1542         """Downloads a specified number of results for a query"""
1543
1544         video_ids = []
1545         pagenum = 0
1546
1547         while True:
1548             self.report_download_page(query, pagenum)
1549             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1550             request = compat_urllib_request.Request(result_url)
1551             try:
1552                 page = compat_urllib_request.urlopen(request).read()
1553             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1554                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1555                 return
1556
1557             # Extract video identifiers
1558             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1559                 video_id = mobj.group(1)
1560                 if video_id not in video_ids:
1561                     video_ids.append(video_id)
1562                     if len(video_ids) == n:
1563                         # Specified n videos reached
1564                         for id in video_ids:
1565                             self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1566                         return
1567
1568             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1569                 for id in video_ids:
1570                     self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1571                 return
1572
1573             pagenum = pagenum + 1
1574
1575
1576 class YahooSearchIE(InfoExtractor):
1577     """Information Extractor for Yahoo! Video search queries."""
1578
1579     _WORKING = False
1580     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1581     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1582     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1583     _MORE_PAGES_INDICATOR = r'\s*Next'
1584     _max_yahoo_results = 1000
1585     IE_NAME = u'video.yahoo:search'
1586
1587     def __init__(self, downloader=None):
1588         InfoExtractor.__init__(self, downloader)
1589
1590     def report_download_page(self, query, pagenum):
1591         """Report attempt to download playlist page with given number."""
1592         query = query.decode(preferredencoding())
1593         self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1594
1595     def _real_extract(self, query):
1596         mobj = re.match(self._VALID_URL, query)
1597         if mobj is None:
1598             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1599             return
1600
1601         prefix, query = query.split(':')
1602         prefix = prefix[8:]
1603         query = query.encode('utf-8')
1604         if prefix == '':
1605             self._download_n_results(query, 1)
1606             return
1607         elif prefix == 'all':
1608             self._download_n_results(query, self._max_yahoo_results)
1609             return
1610         else:
1611             try:
1612                 n = int(prefix)
1613                 if n <= 0:
1614                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1615                     return
1616                 elif n > self._max_yahoo_results:
1617                     self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1618                     n = self._max_yahoo_results
1619                 self._download_n_results(query, n)
1620                 return
1621             except ValueError: # parsing prefix as integer fails
1622                 self._download_n_results(query, 1)
1623                 return
1624
1625     def _download_n_results(self, query, n):
1626         """Downloads a specified number of results for a query"""
1627
1628         video_ids = []
1629         already_seen = set()
1630         pagenum = 1
1631
1632         while True:
1633             self.report_download_page(query, pagenum)
1634             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1635             request = compat_urllib_request.Request(result_url)
1636             try:
1637                 page = compat_urllib_request.urlopen(request).read()
1638             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1639                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1640                 return
1641
1642             # Extract video identifiers
1643             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1644                 video_id = mobj.group(1)
1645                 if video_id not in already_seen:
1646                     video_ids.append(video_id)
1647                     already_seen.add(video_id)
1648                     if len(video_ids) == n:
1649                         # Specified n videos reached
1650                         for id in video_ids:
1651                             self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1652                         return
1653
1654             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1655                 for id in video_ids:
1656                     self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1657                 return
1658
1659             pagenum = pagenum + 1
1660
1661
1662 class YoutubePlaylistIE(InfoExtractor):
1663     """Information Extractor for YouTube playlists."""
1664
1665     _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1666     _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1667     _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1668     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1669     IE_NAME = u'youtube:playlist'
1670
1671     def __init__(self, downloader=None):
1672         InfoExtractor.__init__(self, downloader)
1673
1674     def report_download_page(self, playlist_id, pagenum):
1675         """Report attempt to download playlist page with given number."""
1676         self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1677
1678     def _real_extract(self, url):
1679         # Extract playlist id
1680         mobj = re.match(self._VALID_URL, url)
1681         if mobj is None:
1682             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1683             return
1684
1685         # Single video case
1686         if mobj.group(3) is not None:
1687             self._downloader.download([mobj.group(3)])
1688             return
1689
1690         # Download playlist pages
1691         # prefix is 'p' as default for playlists but there are other types that need extra care
1692         playlist_prefix = mobj.group(1)
1693         if playlist_prefix == 'a':
1694             playlist_access = 'artist'
1695         else:
1696             playlist_prefix = 'p'
1697             playlist_access = 'view_play_list'
1698         playlist_id = mobj.group(2)
1699         video_ids = []
1700         pagenum = 1
1701
1702         while True:
1703             self.report_download_page(playlist_id, pagenum)
1704             url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1705             request = compat_urllib_request.Request(url)
1706             try:
1707                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1708             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1709                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1710                 return
1711
1712             # Extract video identifiers
1713             ids_in_page = []
1714             for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1715                 if mobj.group(1) not in ids_in_page:
1716                     ids_in_page.append(mobj.group(1))
1717             video_ids.extend(ids_in_page)
1718
1719             if self._MORE_PAGES_INDICATOR not in page:
1720                 break
1721             pagenum = pagenum + 1
1722
1723         total = len(video_ids)
1724
1725         playliststart = self._downloader.params.get('playliststart', 1) - 1
1726         playlistend = self._downloader.params.get('playlistend', -1)
1727         if playlistend == -1:
1728             video_ids = video_ids[playliststart:]
1729         else:
1730             video_ids = video_ids[playliststart:playlistend]
1731
1732         if len(video_ids) == total:
1733             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1734         else:
1735             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1736
1737         for id in video_ids:
1738             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1739         return
1740
1741
1742 class YoutubeChannelIE(InfoExtractor):
1743     """Information Extractor for YouTube channels."""
1744
1745     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1746     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1747     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1748     IE_NAME = u'youtube:channel'
1749
1750     def report_download_page(self, channel_id, pagenum):
1751         """Report attempt to download channel page with given number."""
1752         self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1753
1754     def _real_extract(self, url):
1755         # Extract channel id
1756         mobj = re.match(self._VALID_URL, url)
1757         if mobj is None:
1758             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1759             return
1760
1761         # Download channel pages
1762         channel_id = mobj.group(1)
1763         video_ids = []
1764         pagenum = 1
1765
1766         while True:
1767             self.report_download_page(channel_id, pagenum)
1768             url = self._TEMPLATE_URL % (channel_id, pagenum)
1769             request = compat_urllib_request.Request(url)
1770             try:
1771                 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1772             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1773                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1774                 return
1775
1776             # Extract video identifiers
1777             ids_in_page = []
1778             for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1779                 if mobj.group(1) not in ids_in_page:
1780                     ids_in_page.append(mobj.group(1))
1781             video_ids.extend(ids_in_page)
1782
1783             if self._MORE_PAGES_INDICATOR not in page:
1784                 break
1785             pagenum = pagenum + 1
1786
1787         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1788
1789         for id in video_ids:
1790             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1791         return
1792
1793
1794 class YoutubeUserIE(InfoExtractor):
1795     """Information Extractor for YouTube users."""
1796
1797     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1798     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1799     _GDATA_PAGE_SIZE = 50
1800     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1801     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1802     IE_NAME = u'youtube:user'
1803
1804     def __init__(self, downloader=None):
1805         InfoExtractor.__init__(self, downloader)
1806
1807     def report_download_page(self, username, start_index):
1808         """Report attempt to download user page."""
1809         self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1810                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1811
1812     def _real_extract(self, url):
1813         # Extract username
1814         mobj = re.match(self._VALID_URL, url)
1815         if mobj is None:
1816             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1817             return
1818
1819         username = mobj.group(1)
1820
1821         # Download video ids using YouTube Data API. Result size per
1822         # query is limited (currently to 50 videos) so we need to query
1823         # page by page until there are no video ids - it means we got
1824         # all of them.
1825
1826         video_ids = []
1827         pagenum = 0
1828
1829         while True:
1830             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1831             self.report_download_page(username, start_index)
1832
1833             request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1834
1835             try:
1836                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1837             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1838                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1839                 return
1840
1841             # Extract video identifiers
1842             ids_in_page = []
1843
1844             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1845                 if mobj.group(1) not in ids_in_page:
1846                     ids_in_page.append(mobj.group(1))
1847
1848             video_ids.extend(ids_in_page)
1849
1850             # A little optimization - if current page is not
1851             # "full", ie. does not contain PAGE_SIZE video ids then
1852             # we can assume that this page is the last one - there
1853             # are no more ids on further pages - no need to query
1854             # again.
1855
1856             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1857                 break
1858
1859             pagenum += 1
1860
1861         all_ids_count = len(video_ids)
1862         playliststart = self._downloader.params.get('playliststart', 1) - 1
1863         playlistend = self._downloader.params.get('playlistend', -1)
1864
1865         if playlistend == -1:
1866             video_ids = video_ids[playliststart:]
1867         else:
1868             video_ids = video_ids[playliststart:playlistend]
1869
1870         self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1871                 (username, all_ids_count, len(video_ids)))
1872
1873         for video_id in video_ids:
1874             self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1875
1876
1877 class BlipTVUserIE(InfoExtractor):
1878     """Information Extractor for blip.tv users."""
1879
1880     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1881     _PAGE_SIZE = 12
1882     IE_NAME = u'blip.tv:user'
1883
1884     def __init__(self, downloader=None):
1885         InfoExtractor.__init__(self, downloader)
1886
1887     def report_download_page(self, username, pagenum):
1888         """Report attempt to download user page."""
1889         self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1890                 (self.IE_NAME, username, pagenum))
1891
1892     def _real_extract(self, url):
1893         # Extract username
1894         mobj = re.match(self._VALID_URL, url)
1895         if mobj is None:
1896             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1897             return
1898
1899         username = mobj.group(1)
1900
1901         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1902
1903         request = compat_urllib_request.Request(url)
1904
1905         try:
1906             page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1907             mobj = re.search(r'data-users-id="([^"]+)"', page)
1908             page_base = page_base % mobj.group(1)
1909         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1910             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1911             return
1912
1913
1914         # Download video ids using BlipTV Ajax calls. Result size per
1915         # query is limited (currently to 12 videos) so we need to query
1916         # page by page until there are no video ids - it means we got
1917         # all of them.
1918
1919         video_ids = []
1920         pagenum = 1
1921
1922         while True:
1923             self.report_download_page(username, pagenum)
1924
1925             request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1926
1927             try:
1928                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1929             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1930                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1931                 return
1932
1933             # Extract video identifiers
1934             ids_in_page = []
1935
1936             for mobj in re.finditer(r'href="/([^"]+)"', page):
1937                 if mobj.group(1) not in ids_in_page:
1938                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1939
1940             video_ids.extend(ids_in_page)
1941
1942             # A little optimization - if current page is not
1943             # "full", ie. does not contain PAGE_SIZE video ids then
1944             # we can assume that this page is the last one - there
1945             # are no more ids on further pages - no need to query
1946             # again.
1947
1948             if len(ids_in_page) < self._PAGE_SIZE:
1949                 break
1950
1951             pagenum += 1
1952
1953         all_ids_count = len(video_ids)
1954         playliststart = self._downloader.params.get('playliststart', 1) - 1
1955         playlistend = self._downloader.params.get('playlistend', -1)
1956
1957         if playlistend == -1:
1958             video_ids = video_ids[playliststart:]
1959         else:
1960             video_ids = video_ids[playliststart:playlistend]
1961
1962         self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1963                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1964
1965         for video_id in video_ids:
1966             self._downloader.download([u'http://blip.tv/'+video_id])
1967
1968
1969 class DepositFilesIE(InfoExtractor):
1970     """Information extractor for depositfiles.com"""
1971
1972     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1973
1974     def report_download_webpage(self, file_id):
1975         """Report webpage download."""
1976         self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1977
1978     def report_extraction(self, file_id):
1979         """Report information extraction."""
1980         self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1981
1982     def _real_extract(self, url):
1983         file_id = url.split('/')[-1]
1984         # Rebuild url in english locale
1985         url = 'http://depositfiles.com/en/files/' + file_id
1986
1987         # Retrieve file webpage with 'Free download' button pressed
1988         free_download_indication = { 'gateway_result' : '1' }
1989         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1990         try:
1991             self.report_download_webpage(file_id)
1992             webpage = compat_urllib_request.urlopen(request).read()
1993         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1994             self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1995             return
1996
1997         # Search for the real file URL
1998         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1999         if (mobj is None) or (mobj.group(1) is None):
2000             # Try to figure out reason of the error.
2001             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2002             if (mobj is not None) and (mobj.group(1) is not None):
2003                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2004                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2005             else:
2006                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2007             return
2008
2009         file_url = mobj.group(1)
2010         file_extension = os.path.splitext(file_url)[1][1:]
2011
2012         # Search for file title
2013         mobj = re.search(r'<b title="(.*?)">', webpage)
2014         if mobj is None:
2015             self._downloader.trouble(u'ERROR: unable to extract title')
2016             return
2017         file_title = mobj.group(1).decode('utf-8')
2018
2019         return [{
2020             'id':       file_id.decode('utf-8'),
2021             'url':      file_url.decode('utf-8'),
2022             'uploader': None,
2023             'upload_date':  None,
2024             'title':    file_title,
2025             'ext':      file_extension.decode('utf-8'),
2026         }]
2027
2028
2029 class FacebookIE(InfoExtractor):
2030     """Information Extractor for Facebook"""
2031
2032     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2033     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2034     _NETRC_MACHINE = 'facebook'
2035     IE_NAME = u'facebook'
2036
2037     def report_login(self):
2038         """Report attempt to log in."""
2039         self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2040
2041     def _real_initialize(self):
2042         if self._downloader is None:
2043             return
2044
2045         useremail = None
2046         password = None
2047         downloader_params = self._downloader.params
2048
2049         # Attempt to use provided username and password or .netrc data
2050         if downloader_params.get('username', None) is not None:
2051             useremail = downloader_params['username']
2052             password = downloader_params['password']
2053         elif downloader_params.get('usenetrc', False):
2054             try:
2055                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2056                 if info is not None:
2057                     useremail = info[0]
2058                     password = info[2]
2059                 else:
2060                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2061             except (IOError, netrc.NetrcParseError) as err:
2062                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2063                 return
2064
2065         if useremail is None:
2066             return
2067
2068         # Log in
2069         login_form = {
2070             'email': useremail,
2071             'pass': password,
2072             'login': 'Log+In'
2073             }
2074         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2075         try:
2076             self.report_login()
2077             login_results = compat_urllib_request.urlopen(request).read()
2078             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2079                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2080                 return
2081         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2082             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2083             return
2084
2085     def _real_extract(self, url):
2086         mobj = re.match(self._VALID_URL, url)
2087         if mobj is None:
2088             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2089             return
2090         video_id = mobj.group('ID')
2091
2092         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2093         webpage = self._download_webpage(url, video_id)
2094
2095         BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2096         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2097         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2098         if not m:
2099             raise ExtractorError(u'Cannot parse data')
2100         data = dict(json.loads(m.group(1)))
2101         params_raw = compat_urllib_parse.unquote(data['params'])
2102         params = json.loads(params_raw)
2103         video_url = params['hd_src']
2104         if not video_url:
2105             video_url = params['sd_src']
2106         if not video_url:
2107             raise ExtractorError(u'Cannot find video URL')
2108         video_duration = int(params['video_duration'])
2109
2110         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2111         if not m:
2112             raise ExtractorError(u'Cannot find title in webpage')
2113         video_title = unescapeHTML(m.group(1))
2114
2115         info = {
2116             'id': video_id,
2117             'title': video_title,
2118             'url': video_url,
2119             'ext': 'mp4',
2120             'duration': video_duration,
2121             'thumbnail': params['thumbnail_src'],
2122         }
2123         return [info]
2124
2125
2126 class BlipTVIE(InfoExtractor):
2127     """Information extractor for blip.tv"""
2128
2129     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2130     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2131     IE_NAME = u'blip.tv'
2132
2133     def report_extraction(self, file_id):
2134         """Report information extraction."""
2135         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2136
2137     def report_direct_download(self, title):
2138         """Report information extraction."""
2139         self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2140
2141     def _real_extract(self, url):
2142         mobj = re.match(self._VALID_URL, url)
2143         if mobj is None:
2144             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2145             return
2146
2147         if '?' in url:
2148             cchar = '&'
2149         else:
2150             cchar = '?'
2151         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2152         request = compat_urllib_request.Request(json_url)
2153         request.add_header('User-Agent', 'iTunes/10.6.1')
2154         self.report_extraction(mobj.group(1))
2155         info = None
2156         try:
2157             urlh = compat_urllib_request.urlopen(request)
2158             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2159                 basename = url.split('/')[-1]
2160                 title,ext = os.path.splitext(basename)
2161                 title = title.decode('UTF-8')
2162                 ext = ext.replace('.', '')
2163                 self.report_direct_download(title)
2164                 info = {
2165                     'id': title,
2166                     'url': url,
2167                     'uploader': None,
2168                     'upload_date': None,
2169                     'title': title,
2170                     'ext': ext,
2171                     'urlhandle': urlh
2172                 }
2173         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2174             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2175         if info is None: # Regular URL
2176             try:
2177                 json_code_bytes = urlh.read()
2178                 json_code = json_code_bytes.decode('utf-8')
2179             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2180                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2181                 return
2182
2183             try:
2184                 json_data = json.loads(json_code)
2185                 if 'Post' in json_data:
2186                     data = json_data['Post']
2187                 else:
2188                     data = json_data
2189
2190                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2191                 video_url = data['media']['url']
2192                 umobj = re.match(self._URL_EXT, video_url)
2193                 if umobj is None:
2194                     raise ValueError('Can not determine filename extension')
2195                 ext = umobj.group(1)
2196
2197                 info = {
2198                     'id': data['item_id'],
2199                     'url': video_url,
2200                     'uploader': data['display_name'],
2201                     'upload_date': upload_date,
2202                     'title': data['title'],
2203                     'ext': ext,
2204                     'format': data['media']['mimeType'],
2205                     'thumbnail': data['thumbnailUrl'],
2206                     'description': data['description'],
2207                     'player_url': data['embedUrl'],
2208                     'user_agent': 'iTunes/10.6.1',
2209                 }
2210             except (ValueError,KeyError) as err:
2211                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2212                 return
2213
2214         return [info]
2215
2216
2217 class MyVideoIE(InfoExtractor):
2218     """Information Extractor for myvideo.de."""
2219
2220     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2221     IE_NAME = u'myvideo'
2222
2223     def __init__(self, downloader=None):
2224         InfoExtractor.__init__(self, downloader)
2225
2226     def report_extraction(self, video_id):
2227         """Report information extraction."""
2228         self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2229
2230     def _real_extract(self,url):
2231         mobj = re.match(self._VALID_URL, url)
2232         if mobj is None:
2233             self._download.trouble(u'ERROR: invalid URL: %s' % url)
2234             return
2235
2236         video_id = mobj.group(1)
2237
2238         # Get video webpage
2239         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2240         webpage = self._download_webpage(webpage_url, video_id)
2241
2242         self.report_extraction(video_id)
2243         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
2244                  webpage)
2245         if mobj is None:
2246             self._downloader.trouble(u'ERROR: unable to extract media URL')
2247             return
2248         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2249
2250         mobj = re.search('<title>([^<]+)</title>', webpage)
2251         if mobj is None:
2252             self._downloader.trouble(u'ERROR: unable to extract title')
2253             return
2254
2255         video_title = mobj.group(1)
2256
2257         return [{
2258             'id':       video_id,
2259             'url':      video_url,
2260             'uploader': None,
2261             'upload_date':  None,
2262             'title':    video_title,
2263             'ext':      u'flv',
2264         }]
2265
2266 class ComedyCentralIE(InfoExtractor):
2267     """Information extractor for The Daily Show and Colbert Report """
2268
2269     # urls can be abbreviations like :thedailyshow or :colbert
2270     # urls for episodes like:
2271     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2272     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2273     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2274     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2275                       |(https?://)?(www\.)?
2276                           (?P<showname>thedailyshow|colbertnation)\.com/
2277                          (full-episodes/(?P<episode>.*)|
2278                           (?P<clip>
2279                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2280                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2281                      $"""
2282
2283     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2284
2285     _video_extensions = {
2286         '3500': 'mp4',
2287         '2200': 'mp4',
2288         '1700': 'mp4',
2289         '1200': 'mp4',
2290         '750': 'mp4',
2291         '400': 'mp4',
2292     }
2293     _video_dimensions = {
2294         '3500': '1280x720',
2295         '2200': '960x540',
2296         '1700': '768x432',
2297         '1200': '640x360',
2298         '750': '512x288',
2299         '400': '384x216',
2300     }
2301
2302     def suitable(self, url):
2303         """Receives a URL and returns True if suitable for this IE."""
2304         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2305
2306     def report_extraction(self, episode_id):
2307         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2308
2309     def report_config_download(self, episode_id, media_id):
2310         self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2311
2312     def report_index_download(self, episode_id):
2313         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2314
2315     def _print_formats(self, formats):
2316         print('Available formats:')
2317         for x in formats:
2318             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2319
2320
2321     def _real_extract(self, url):
2322         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2323         if mobj is None:
2324             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2325             return
2326
2327         if mobj.group('shortname'):
2328             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2329                 url = u'http://www.thedailyshow.com/full-episodes/'
2330             else:
2331                 url = u'http://www.colbertnation.com/full-episodes/'
2332             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2333             assert mobj is not None
2334
2335         if mobj.group('clip'):
2336             if mobj.group('showname') == 'thedailyshow':
2337                 epTitle = mobj.group('tdstitle')
2338             else:
2339                 epTitle = mobj.group('cntitle')
2340             dlNewest = False
2341         else:
2342             dlNewest = not mobj.group('episode')
2343             if dlNewest:
2344                 epTitle = mobj.group('showname')
2345             else:
2346                 epTitle = mobj.group('episode')
2347
2348         req = compat_urllib_request.Request(url)
2349         self.report_extraction(epTitle)
2350         try:
2351             htmlHandle = compat_urllib_request.urlopen(req)
2352             html = htmlHandle.read()
2353             webpage = html.decode('utf-8')
2354         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2355             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2356             return
2357         if dlNewest:
2358             url = htmlHandle.geturl()
2359             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2360             if mobj is None:
2361                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2362                 return
2363             if mobj.group('episode') == '':
2364                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2365                 return
2366             epTitle = mobj.group('episode')
2367
2368         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2369
2370         if len(mMovieParams) == 0:
2371             # The Colbert Report embeds the information in a without
2372             # a URL prefix; so extract the alternate reference
2373             # and then add the URL prefix manually.
2374
2375             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2376             if len(altMovieParams) == 0:
2377                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2378                 return
2379             else:
2380                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2381
2382         uri = mMovieParams[0][1]
2383         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2384         self.report_index_download(epTitle)
2385         try:
2386             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2387         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2388             self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2389             return
2390
2391         results = []
2392
2393         idoc = xml.etree.ElementTree.fromstring(indexXml)
2394         itemEls = idoc.findall('.//item')
2395         for partNum,itemEl in enumerate(itemEls):
2396             mediaId = itemEl.findall('./guid')[0].text
2397             shortMediaId = mediaId.split(':')[-1]
2398             showId = mediaId.split(':')[-2].replace('.com', '')
2399             officialTitle = itemEl.findall('./title')[0].text
2400             officialDate = itemEl.findall('./pubDate')[0].text
2401
2402             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2403                         compat_urllib_parse.urlencode({'uri': mediaId}))
2404             configReq = compat_urllib_request.Request(configUrl)
2405             self.report_config_download(epTitle, shortMediaId)
2406             try:
2407                 configXml = compat_urllib_request.urlopen(configReq).read()
2408             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2409                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2410                 return
2411
2412             cdoc = xml.etree.ElementTree.fromstring(configXml)
2413             turls = []
2414             for rendition in cdoc.findall('.//rendition'):
2415                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2416                 turls.append(finfo)
2417
2418             if len(turls) == 0:
2419                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2420                 continue
2421
2422             if self._downloader.params.get('listformats', None):
2423                 self._print_formats([i[0] for i in turls])
2424                 return
2425
2426             # For now, just pick the highest bitrate
2427             format,rtmp_video_url = turls[-1]
2428
2429             # Get the format arg from the arg stream
2430             req_format = self._downloader.params.get('format', None)
2431
2432             # Select format if we can find one
2433             for f,v in turls:
2434                 if f == req_format:
2435                     format, rtmp_video_url = f, v
2436                     break
2437
2438             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2439             if not m:
2440                 raise ExtractorError(u'Cannot transform RTMP url')
2441             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2442             video_url = base + m.group('finalid')
2443
2444             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2445             info = {
2446                 'id': shortMediaId,
2447                 'url': video_url,
2448                 'uploader': showId,
2449                 'upload_date': officialDate,
2450                 'title': effTitle,
2451                 'ext': 'mp4',
2452                 'format': format,
2453                 'thumbnail': None,
2454                 'description': officialTitle,
2455             }
2456             results.append(info)
2457
2458         return results
2459
2460
2461 class EscapistIE(InfoExtractor):
2462     """Information extractor for The Escapist """
2463
2464     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2465     IE_NAME = u'escapist'
2466
2467     def report_extraction(self, showName):
2468         self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2469
2470     def report_config_download(self, showName):
2471         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2472
2473     def _real_extract(self, url):
2474         mobj = re.match(self._VALID_URL, url)
2475         if mobj is None:
2476             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2477             return
2478         showName = mobj.group('showname')
2479         videoId = mobj.group('episode')
2480
2481         self.report_extraction(showName)
2482         try:
2483             webPage = compat_urllib_request.urlopen(url)
2484             webPageBytes = webPage.read()
2485             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2486             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2487         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2488             self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2489             return
2490
2491         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2492         description = unescapeHTML(descMatch.group(1))
2493         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2494         imgUrl = unescapeHTML(imgMatch.group(1))
2495         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2496         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2497         configUrlMatch = re.search('config=(.*)$', playerUrl)
2498         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2499
2500         self.report_config_download(showName)
2501         try:
2502             configJSON = compat_urllib_request.urlopen(configUrl)
2503             m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2504             configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2505         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2506             self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2507             return
2508
2509         # Technically, it's JavaScript, not JSON
2510         configJSON = configJSON.replace("'", '"')
2511
2512         try:
2513             config = json.loads(configJSON)
2514         except (ValueError,) as err:
2515             self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2516             return
2517
2518         playlist = config['playlist']
2519         videoUrl = playlist[1]['url']
2520
2521         info = {
2522             'id': videoId,
2523             'url': videoUrl,
2524             'uploader': showName,
2525             'upload_date': None,
2526             'title': showName,
2527             'ext': 'flv',
2528             'thumbnail': imgUrl,
2529             'description': description,
2530             'player_url': playerUrl,
2531         }
2532
2533         return [info]
2534
2535 class CollegeHumorIE(InfoExtractor):
2536     """Information extractor for collegehumor.com"""
2537
2538     _WORKING = False
2539     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2540     IE_NAME = u'collegehumor'
2541
2542     def report_manifest(self, video_id):
2543         """Report information extraction."""
2544         self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2545
2546     def report_extraction(self, video_id):
2547         """Report information extraction."""
2548         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2549
2550     def _real_extract(self, url):
2551         mobj = re.match(self._VALID_URL, url)
2552         if mobj is None:
2553             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2554             return
2555         video_id = mobj.group('videoid')
2556
2557         info = {
2558             'id': video_id,
2559             'uploader': None,
2560             'upload_date': None,
2561         }
2562
2563         self.report_extraction(video_id)
2564         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2565         try:
2566             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2567         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2568             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2569             return
2570
2571         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2572         try:
2573             videoNode = mdoc.findall('./video')[0]
2574             info['description'] = videoNode.findall('./description')[0].text
2575             info['title'] = videoNode.findall('./caption')[0].text
2576             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2577             manifest_url = videoNode.findall('./file')[0].text
2578         except IndexError:
2579             self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2580             return
2581
2582         manifest_url += '?hdcore=2.10.3'
2583         self.report_manifest(video_id)
2584         try:
2585             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2586         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2587             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2588             return
2589
2590         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2591         try:
2592             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2593             node_id = media_node.attrib['url']
2594             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2595         except IndexError as err:
2596             self._downloader.trouble(u'\nERROR: Invalid manifest file')
2597             return
2598
2599         url_pr = compat_urllib_parse_urlparse(manifest_url)
2600         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2601
2602         info['url'] = url
2603         info['ext'] = 'f4f'
2604         return [info]
2605
2606
2607 class XVideosIE(InfoExtractor):
2608     """Information extractor for xvideos.com"""
2609
2610     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2611     IE_NAME = u'xvideos'
2612
2613     def report_extraction(self, video_id):
2614         """Report information extraction."""
2615         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2616
2617     def _real_extract(self, url):
2618         mobj = re.match(self._VALID_URL, url)
2619         if mobj is None:
2620             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2621             return
2622         video_id = mobj.group(1)
2623
2624         webpage = self._download_webpage(url, video_id)
2625
2626         self.report_extraction(video_id)
2627
2628
2629         # Extract video URL
2630         mobj = re.search(r'flv_url=(.+?)&', webpage)
2631         if mobj is None:
2632             self._downloader.trouble(u'ERROR: unable to extract video url')
2633             return
2634         video_url = compat_urllib_parse.unquote(mobj.group(1))
2635
2636
2637         # Extract title
2638         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2639         if mobj is None:
2640             self._downloader.trouble(u'ERROR: unable to extract video title')
2641             return
2642         video_title = mobj.group(1)
2643
2644
2645         # Extract video thumbnail
2646         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2647         if mobj is None:
2648             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2649             return
2650         video_thumbnail = mobj.group(0)
2651
2652         info = {
2653             'id': video_id,
2654             'url': video_url,
2655             'uploader': None,
2656             'upload_date': None,
2657             'title': video_title,
2658             'ext': 'flv',
2659             'thumbnail': video_thumbnail,
2660             'description': None,
2661         }
2662
2663         return [info]
2664
2665
2666 class SoundcloudIE(InfoExtractor):
2667     """Information extractor for soundcloud.com
2668        To access the media, the uid of the song and a stream token
2669        must be extracted from the page source and the script must make
2670        a request to media.soundcloud.com/crossdomain.xml. Then
2671        the media can be grabbed by requesting from an url composed
2672        of the stream token and uid
2673      """
2674
2675     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2676     IE_NAME = u'soundcloud'
2677
2678     def __init__(self, downloader=None):
2679         InfoExtractor.__init__(self, downloader)
2680
2681     def report_resolve(self, video_id):
2682         """Report information extraction."""
2683         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2684
2685     def report_extraction(self, video_id):
2686         """Report information extraction."""
2687         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2688
2689     def _real_extract(self, url):
2690         mobj = re.match(self._VALID_URL, url)
2691         if mobj is None:
2692             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2693             return
2694
2695         # extract uploader (which is in the url)
2696         uploader = mobj.group(1)
2697         # extract simple title (uploader + slug of song title)
2698         slug_title =  mobj.group(2)
2699         simple_title = uploader + u'-' + slug_title
2700
2701         self.report_resolve('%s/%s' % (uploader, slug_title))
2702
2703         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2704         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2705         request = compat_urllib_request.Request(resolv_url)
2706         try:
2707             info_json_bytes = compat_urllib_request.urlopen(request).read()
2708             info_json = info_json_bytes.decode('utf-8')
2709         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2710             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2711             return
2712
2713         info = json.loads(info_json)
2714         video_id = info['id']
2715         self.report_extraction('%s/%s' % (uploader, slug_title))
2716
2717         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2718         request = compat_urllib_request.Request(streams_url)
2719         try:
2720             stream_json_bytes = compat_urllib_request.urlopen(request).read()
2721             stream_json = stream_json_bytes.decode('utf-8')
2722         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2723             self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2724             return
2725
2726         streams = json.loads(stream_json)
2727         mediaURL = streams['http_mp3_128_url']
2728
2729         return [{
2730             'id':       info['id'],
2731             'url':      mediaURL,
2732             'uploader': info['user']['username'],
2733             'upload_date':  info['created_at'],
2734             'title':    info['title'],
2735             'ext':      u'mp3',
2736             'description': info['description'],
2737         }]
2738
2739
2740 class InfoQIE(InfoExtractor):
2741     """Information extractor for infoq.com"""
2742     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2743
2744     def report_extraction(self, video_id):
2745         """Report information extraction."""
2746         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2747
2748     def _real_extract(self, url):
2749         mobj = re.match(self._VALID_URL, url)
2750         if mobj is None:
2751             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2752             return
2753
2754         webpage = self._download_webpage(url, video_id=url)
2755         self.report_extraction(url)
2756
2757         # Extract video URL
2758         mobj = re.search(r"jsclassref='([^']*)'", webpage)
2759         if mobj is None:
2760             self._downloader.trouble(u'ERROR: unable to extract video url')
2761             return
2762         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2763         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2764
2765         # Extract title
2766         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2767         if mobj is None:
2768             self._downloader.trouble(u'ERROR: unable to extract video title')
2769             return
2770         video_title = mobj.group(1)
2771
2772         # Extract description
2773         video_description = u'No description available.'
2774         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2775         if mobj is not None:
2776             video_description = mobj.group(1)
2777
2778         video_filename = video_url.split('/')[-1]
2779         video_id, extension = video_filename.split('.')
2780
2781         info = {
2782             'id': video_id,
2783             'url': video_url,
2784             'uploader': None,
2785             'upload_date': None,
2786             'title': video_title,
2787             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2788             'thumbnail': None,
2789             'description': video_description,
2790         }
2791
2792         return [info]
2793
2794 class MixcloudIE(InfoExtractor):
2795     """Information extractor for www.mixcloud.com"""
2796
2797     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2798     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2799     IE_NAME = u'mixcloud'
2800
2801     def __init__(self, downloader=None):
2802         InfoExtractor.__init__(self, downloader)
2803
2804     def report_download_json(self, file_id):
2805         """Report JSON download."""
2806         self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2807
2808     def report_extraction(self, file_id):
2809         """Report information extraction."""
2810         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2811
2812     def get_urls(self, jsonData, fmt, bitrate='best'):
2813         """Get urls from 'audio_formats' section in json"""
2814         file_url = None
2815         try:
2816             bitrate_list = jsonData[fmt]
2817             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2818                 bitrate = max(bitrate_list) # select highest
2819
2820             url_list = jsonData[fmt][bitrate]
2821         except TypeError: # we have no bitrate info.
2822             url_list = jsonData[fmt]
2823         return url_list
2824
2825     def check_urls(self, url_list):
2826         """Returns 1st active url from list"""
2827         for url in url_list:
2828             try:
2829                 compat_urllib_request.urlopen(url)
2830                 return url
2831             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2832                 url = None
2833
2834         return None
2835
2836     def _print_formats(self, formats):
2837         print('Available formats:')
2838         for fmt in formats.keys():
2839             for b in formats[fmt]:
2840                 try:
2841                     ext = formats[fmt][b][0]
2842                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2843                 except TypeError: # we have no bitrate info
2844                     ext = formats[fmt][0]
2845                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2846                     break
2847
2848     def _real_extract(self, url):
2849         mobj = re.match(self._VALID_URL, url)
2850         if mobj is None:
2851             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2852             return
2853         # extract uploader & filename from url
2854         uploader = mobj.group(1).decode('utf-8')
2855         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2856
2857         # construct API request
2858         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2859         # retrieve .json file with links to files
2860         request = compat_urllib_request.Request(file_url)
2861         try:
2862             self.report_download_json(file_url)
2863             jsonData = compat_urllib_request.urlopen(request).read()
2864         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2865             self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2866             return
2867
2868         # parse JSON
2869         json_data = json.loads(jsonData)
2870         player_url = json_data['player_swf_url']
2871         formats = dict(json_data['audio_formats'])
2872
2873         req_format = self._downloader.params.get('format', None)
2874         bitrate = None
2875
2876         if self._downloader.params.get('listformats', None):
2877             self._print_formats(formats)
2878             return
2879
2880         if req_format is None or req_format == 'best':
2881             for format_param in formats.keys():
2882                 url_list = self.get_urls(formats, format_param)
2883                 # check urls
2884                 file_url = self.check_urls(url_list)
2885                 if file_url is not None:
2886                     break # got it!
2887         else:
2888             if req_format not in formats:
2889                 self._downloader.trouble(u'ERROR: format is not available')
2890                 return
2891
2892             url_list = self.get_urls(formats, req_format)
2893             file_url = self.check_urls(url_list)
2894             format_param = req_format
2895
2896         return [{
2897             'id': file_id.decode('utf-8'),
2898             'url': file_url.decode('utf-8'),
2899             'uploader': uploader.decode('utf-8'),
2900             'upload_date': None,
2901             'title': json_data['name'],
2902             'ext': file_url.split('.')[-1].decode('utf-8'),
2903             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2904             'thumbnail': json_data['thumbnail_url'],
2905             'description': json_data['description'],
2906             'player_url': player_url.decode('utf-8'),
2907         }]
2908
2909 class StanfordOpenClassroomIE(InfoExtractor):
2910     """Information extractor for Stanford's Open ClassRoom"""
2911
2912     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2913     IE_NAME = u'stanfordoc'
2914
2915     def report_download_webpage(self, objid):
2916         """Report information extraction."""
2917         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2918
2919     def report_extraction(self, video_id):
2920         """Report information extraction."""
2921         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2922
2923     def _real_extract(self, url):
2924         mobj = re.match(self._VALID_URL, url)
2925         if mobj is None:
2926             raise ExtractorError(u'Invalid URL: %s' % url)
2927
2928         if mobj.group('course') and mobj.group('video'): # A specific video
2929             course = mobj.group('course')
2930             video = mobj.group('video')
2931             info = {
2932                 'id': course + '_' + video,
2933                 'uploader': None,
2934                 'upload_date': None,
2935             }
2936
2937             self.report_extraction(info['id'])
2938             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2939             xmlUrl = baseUrl + video + '.xml'
2940             try:
2941                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2942             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2943                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2944                 return
2945             mdoc = xml.etree.ElementTree.fromstring(metaXml)
2946             try:
2947                 info['title'] = mdoc.findall('./title')[0].text
2948                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2949             except IndexError:
2950                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2951                 return
2952             info['ext'] = info['url'].rpartition('.')[2]
2953             return [info]
2954         elif mobj.group('course'): # A course page
2955             course = mobj.group('course')
2956             info = {
2957                 'id': course,
2958                 'type': 'playlist',
2959                 'uploader': None,
2960                 'upload_date': None,
2961             }
2962
2963             coursepage = self._download_webpage(url, info['id'],
2964                                         note='Downloading course info page',
2965                                         errnote='Unable to download course info page')
2966
2967             m = re.search('<h1>([^<]+)</h1>', coursepage)
2968             if m:
2969                 info['title'] = unescapeHTML(m.group(1))
2970             else:
2971                 info['title'] = info['id']
2972
2973             m = re.search('<description>([^<]+)</description>', coursepage)
2974             if m:
2975                 info['description'] = unescapeHTML(m.group(1))
2976
2977             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2978             info['list'] = [
2979                 {
2980                     'type': 'reference',
2981                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2982                 }
2983                     for vpage in links]
2984             results = []
2985             for entry in info['list']:
2986                 assert entry['type'] == 'reference'
2987                 results += self.extract(entry['url'])
2988             return results
2989         else: # Root page
2990             info = {
2991                 'id': 'Stanford OpenClassroom',
2992                 'type': 'playlist',
2993                 'uploader': None,
2994                 'upload_date': None,
2995             }
2996
2997             self.report_download_webpage(info['id'])
2998             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2999             try:
3000                 rootpage = compat_urllib_request.urlopen(rootURL).read()
3001             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3002                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3003                 return
3004
3005             info['title'] = info['id']
3006
3007             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3008             info['list'] = [
3009                 {
3010                     'type': 'reference',
3011                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3012                 }
3013                     for cpage in links]
3014
3015             results = []
3016             for entry in info['list']:
3017                 assert entry['type'] == 'reference'
3018                 results += self.extract(entry['url'])
3019             return results
3020
3021 class MTVIE(InfoExtractor):
3022     """Information extractor for MTV.com"""
3023
3024     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3025     IE_NAME = u'mtv'
3026
3027     def report_extraction(self, video_id):
3028         """Report information extraction."""
3029         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3030
3031     def _real_extract(self, url):
3032         mobj = re.match(self._VALID_URL, url)
3033         if mobj is None:
3034             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3035             return
3036         if not mobj.group('proto'):
3037             url = 'http://' + url
3038         video_id = mobj.group('videoid')
3039
3040         webpage = self._download_webpage(url, video_id)
3041
3042         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3043         if mobj is None:
3044             self._downloader.trouble(u'ERROR: unable to extract song name')
3045             return
3046         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3047         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3048         if mobj is None:
3049             self._downloader.trouble(u'ERROR: unable to extract performer')
3050             return
3051         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3052         video_title = performer + ' - ' + song_name
3053
3054         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3055         if mobj is None:
3056             self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3057             return
3058         mtvn_uri = mobj.group(1)
3059
3060         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3061         if mobj is None:
3062             self._downloader.trouble(u'ERROR: unable to extract content id')
3063             return
3064         content_id = mobj.group(1)
3065
3066         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3067         self.report_extraction(video_id)
3068         request = compat_urllib_request.Request(videogen_url)
3069         try:
3070             metadataXml = compat_urllib_request.urlopen(request).read()
3071         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3072             self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3073             return
3074
3075         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3076         renditions = mdoc.findall('.//rendition')
3077
3078         # For now, always pick the highest quality.
3079         rendition = renditions[-1]
3080
3081         try:
3082             _,_,ext = rendition.attrib['type'].partition('/')
3083             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3084             video_url = rendition.find('./src').text
3085         except KeyError:
3086             self._downloader.trouble('Invalid rendition field.')
3087             return
3088
3089         info = {
3090             'id': video_id,
3091             'url': video_url,
3092             'uploader': performer,
3093             'upload_date': None,
3094             'title': video_title,
3095             'ext': ext,
3096             'format': format,
3097         }
3098
3099         return [info]
3100
3101
3102 class YoukuIE(InfoExtractor):
3103     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3104
3105     def report_download_webpage(self, file_id):
3106         """Report webpage download."""
3107         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3108
3109     def report_extraction(self, file_id):
3110         """Report information extraction."""
3111         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3112
3113     def _gen_sid(self):
3114         nowTime = int(time.time() * 1000)
3115         random1 = random.randint(1000,1998)
3116         random2 = random.randint(1000,9999)
3117
3118         return "%d%d%d" %(nowTime,random1,random2)
3119
3120     def _get_file_ID_mix_string(self, seed):
3121         mixed = []
3122         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3123         seed = float(seed)
3124         for i in range(len(source)):
3125             seed  =  (seed * 211 + 30031 ) % 65536
3126             index  =  math.floor(seed / 65536 * len(source) )
3127             mixed.append(source[int(index)])
3128             source.remove(source[int(index)])
3129         #return ''.join(mixed)
3130         return mixed
3131
3132     def _get_file_id(self, fileId, seed):
3133         mixed = self._get_file_ID_mix_string(seed)
3134         ids = fileId.split('*')
3135         realId = []
3136         for ch in ids:
3137             if ch:
3138                 realId.append(mixed[int(ch)])
3139         return ''.join(realId)
3140
3141     def _real_extract(self, url):
3142         mobj = re.match(self._VALID_URL, url)
3143         if mobj is None:
3144             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3145             return
3146         video_id = mobj.group('ID')
3147
3148         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3149
3150         request = compat_urllib_request.Request(info_url, None, std_headers)
3151         try:
3152             self.report_download_webpage(video_id)
3153             jsondata = compat_urllib_request.urlopen(request).read()
3154         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3155             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3156             return
3157
3158         self.report_extraction(video_id)
3159         try:
3160             jsonstr = jsondata.decode('utf-8')
3161             config = json.loads(jsonstr)
3162
3163             video_title =  config['data'][0]['title']
3164             seed = config['data'][0]['seed']
3165
3166             format = self._downloader.params.get('format', None)
3167             supported_format = list(config['data'][0]['streamfileids'].keys())
3168
3169             if format is None or format == 'best':
3170                 if 'hd2' in supported_format:
3171                     format = 'hd2'
3172                 else:
3173                     format = 'flv'
3174                 ext = u'flv'
3175             elif format == 'worst':
3176                 format = 'mp4'
3177                 ext = u'mp4'
3178             else:
3179                 format = 'flv'
3180                 ext = u'flv'
3181
3182
3183             fileid = config['data'][0]['streamfileids'][format]
3184             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3185         except (UnicodeDecodeError, ValueError, KeyError):
3186             self._downloader.trouble(u'ERROR: unable to extract info section')
3187             return
3188
3189         files_info=[]
3190         sid = self._gen_sid()
3191         fileid = self._get_file_id(fileid, seed)
3192
3193         #column 8,9 of fileid represent the segment number
3194         #fileid[7:9] should be changed
3195         for index, key in enumerate(keys):
3196
3197             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3198             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3199
3200             info = {
3201                 'id': '%s_part%02d' % (video_id, index),
3202                 'url': download_url,
3203                 'uploader': None,
3204                 'upload_date': None,
3205                 'title': video_title,
3206                 'ext': ext,
3207             }
3208             files_info.append(info)
3209
3210         return files_info
3211
3212
3213 class XNXXIE(InfoExtractor):
3214     """Information extractor for xnxx.com"""
3215
3216     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3217     IE_NAME = u'xnxx'
3218     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3219     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3220     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3221
3222     def report_webpage(self, video_id):
3223         """Report information extraction"""
3224         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3225
3226     def report_extraction(self, video_id):
3227         """Report information extraction"""
3228         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3229
3230     def _real_extract(self, url):
3231         mobj = re.match(self._VALID_URL, url)
3232         if mobj is None:
3233             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3234             return
3235         video_id = mobj.group(1)
3236
3237         self.report_webpage(video_id)
3238
3239         # Get webpage content
3240         try:
3241             webpage_bytes = compat_urllib_request.urlopen(url).read()
3242             webpage = webpage_bytes.decode('utf-8')
3243         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3244             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3245             return
3246
3247         result = re.search(self.VIDEO_URL_RE, webpage)
3248         if result is None:
3249             self._downloader.trouble(u'ERROR: unable to extract video url')
3250             return
3251         video_url = compat_urllib_parse.unquote(result.group(1))
3252
3253         result = re.search(self.VIDEO_TITLE_RE, webpage)
3254         if result is None:
3255             self._downloader.trouble(u'ERROR: unable to extract video title')
3256             return
3257         video_title = result.group(1)
3258
3259         result = re.search(self.VIDEO_THUMB_RE, webpage)
3260         if result is None:
3261             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3262             return
3263         video_thumbnail = result.group(1)
3264
3265         return [{
3266             'id': video_id,
3267             'url': video_url,
3268             'uploader': None,
3269             'upload_date': None,
3270             'title': video_title,
3271             'ext': 'flv',
3272             'thumbnail': video_thumbnail,
3273             'description': None,
3274         }]
3275
3276
3277 class GooglePlusIE(InfoExtractor):
3278     """Information extractor for plus.google.com."""
3279
3280     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3281     IE_NAME = u'plus.google'
3282
3283     def __init__(self, downloader=None):
3284         InfoExtractor.__init__(self, downloader)
3285
3286     def report_extract_entry(self, url):
3287         """Report downloading extry"""
3288         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3289
3290     def report_date(self, upload_date):
3291         """Report downloading extry"""
3292         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3293
3294     def report_uploader(self, uploader):
3295         """Report downloading extry"""
3296         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3297
3298     def report_title(self, video_title):
3299         """Report downloading extry"""
3300         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3301
3302     def report_extract_vid_page(self, video_page):
3303         """Report information extraction."""
3304         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3305
3306     def _real_extract(self, url):
3307         # Extract id from URL
3308         mobj = re.match(self._VALID_URL, url)
3309         if mobj is None:
3310             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3311             return
3312
3313         post_url = mobj.group(0)
3314         video_id = mobj.group(1)
3315
3316         video_extension = 'flv'
3317
3318         # Step 1, Retrieve post webpage to extract further information
3319         self.report_extract_entry(post_url)
3320         request = compat_urllib_request.Request(post_url)
3321         try:
3322             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3323         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3324             self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3325             return
3326
3327         # Extract update date
3328         upload_date = None
3329         pattern = 'title="Timestamp">(.*?)</a>'
3330         mobj = re.search(pattern, webpage)
3331         if mobj:
3332             upload_date = mobj.group(1)
3333             # Convert timestring to a format suitable for filename
3334             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3335             upload_date = upload_date.strftime('%Y%m%d')
3336         self.report_date(upload_date)
3337
3338         # Extract uploader
3339         uploader = None
3340         pattern = r'rel\="author".*?>(.*?)</a>'
3341         mobj = re.search(pattern, webpage)
3342         if mobj:
3343             uploader = mobj.group(1)
3344         self.report_uploader(uploader)
3345
3346         # Extract title
3347         # Get the first line for title
3348         video_title = u'NA'
3349         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3350         mobj = re.search(pattern, webpage)
3351         if mobj:
3352             video_title = mobj.group(1)
3353         self.report_title(video_title)
3354
3355         # Step 2, Stimulate clicking the image box to launch video
3356         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3357         mobj = re.search(pattern, webpage)
3358         if mobj is None:
3359             self._downloader.trouble(u'ERROR: unable to extract video page URL')
3360
3361         video_page = mobj.group(1)
3362         request = compat_urllib_request.Request(video_page)
3363         try:
3364             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3365         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3366             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3367             return
3368         self.report_extract_vid_page(video_page)
3369
3370
3371         # Extract video links on video page
3372         """Extract video links of all sizes"""
3373         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3374         mobj = re.findall(pattern, webpage)
3375         if len(mobj) == 0:
3376             self._downloader.trouble(u'ERROR: unable to extract video links')
3377
3378         # Sort in resolution
3379         links = sorted(mobj)
3380
3381         # Choose the lowest of the sort, i.e. highest resolution
3382         video_url = links[-1]
3383         # Only get the url. The resolution part in the tuple has no use anymore
3384         video_url = video_url[-1]
3385         # Treat escaped \u0026 style hex
3386         try:
3387             video_url = video_url.decode("unicode_escape")
3388         except AttributeError: # Python 3
3389             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3390
3391
3392         return [{
3393             'id':       video_id,
3394             'url':      video_url,
3395             'uploader': uploader,
3396             'upload_date':  upload_date,
3397             'title':    video_title,
3398             'ext':      video_extension,
3399         }]
3400
3401 class NBAIE(InfoExtractor):
3402     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3403     IE_NAME = u'nba'
3404
3405     def _real_extract(self, url):
3406         mobj = re.match(self._VALID_URL, url)
3407         if mobj is None:
3408             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3409             return
3410
3411         video_id = mobj.group(1)
3412         if video_id.endswith('/index.html'):
3413             video_id = video_id[:-len('/index.html')]
3414
3415         webpage = self._download_webpage(url, video_id)
3416
3417         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3418         def _findProp(rexp, default=None):
3419             m = re.search(rexp, webpage)
3420             if m:
3421                 return unescapeHTML(m.group(1))
3422             else:
3423                 return default
3424
3425         shortened_video_id = video_id.rpartition('/')[2]
3426         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3427         info = {
3428             'id': shortened_video_id,
3429             'url': video_url,
3430             'ext': 'mp4',
3431             'title': title,
3432             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3433             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3434         }
3435         return [info]
3436
3437 class JustinTVIE(InfoExtractor):
3438     """Information extractor for justin.tv and twitch.tv"""
3439     # TODO: One broadcast may be split into multiple videos. The key
3440     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3441     # starts at 1 and increases. Can we treat all parts as one video?
3442
3443     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3444         ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3445     _JUSTIN_PAGE_LIMIT = 100
3446     IE_NAME = u'justin.tv'
3447
3448     def report_extraction(self, file_id):
3449         """Report information extraction."""
3450         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3451
3452     def report_download_page(self, channel, offset):
3453         """Report attempt to download a single page of videos."""
3454         self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3455                 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3456
3457     # Return count of items, list of *valid* items
3458     def _parse_page(self, url):
3459         try:
3460             urlh = compat_urllib_request.urlopen(url)
3461             webpage_bytes = urlh.read()
3462             webpage = webpage_bytes.decode('utf-8', 'ignore')
3463         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3464             self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3465             return
3466
3467         response = json.loads(webpage)
3468         if type(response) != list:
3469             error_text = response.get('error', 'unknown error')
3470             self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3471             return
3472         info = []
3473         for clip in response:
3474             video_url = clip['video_file_url']
3475             if video_url:
3476                 video_extension = os.path.splitext(video_url)[1][1:]
3477                 video_date = re.sub('-', '', clip['start_time'][:10])
3478                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3479                 video_id = clip['id']
3480                 video_title = clip.get('title', video_id)
3481                 info.append({
3482                     'id': video_id,
3483                     'url': video_url,
3484                     'title': video_title,
3485                     'uploader': clip.get('channel_name', video_uploader_id),
3486                     'uploader_id': video_uploader_id,
3487                     'upload_date': video_date,
3488                     'ext': video_extension,
3489                 })
3490         return (len(response), info)
3491
3492     def _real_extract(self, url):
3493         mobj = re.match(self._VALID_URL, url)
3494         if mobj is None:
3495             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3496             return
3497
3498         api = 'http://api.justin.tv'
3499         video_id = mobj.group(mobj.lastindex)
3500         paged = False
3501         if mobj.lastindex == 1:
3502             paged = True
3503             api += '/channel/archives/%s.json'
3504         else:
3505             api += '/broadcast/by_archive/%s.json'
3506         api = api % (video_id,)
3507
3508         self.report_extraction(video_id)
3509
3510         info = []
3511         offset = 0
3512         limit = self._JUSTIN_PAGE_LIMIT
3513         while True:
3514             if paged:
3515                 self.report_download_page(video_id, offset)
3516             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3517             page_count, page_info = self._parse_page(page_url)
3518             info.extend(page_info)
3519             if not paged or page_count != limit:
3520                 break
3521             offset += limit
3522         return info
3523
3524 class FunnyOrDieIE(InfoExtractor):
3525     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3526
3527     def _real_extract(self, url):
3528         mobj = re.match(self._VALID_URL, url)
3529         if mobj is None:
3530             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3531             return
3532
3533         video_id = mobj.group('id')
3534         webpage = self._download_webpage(url, video_id)
3535
3536         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3537         if not m:
3538             self._downloader.trouble(u'ERROR: unable to find video information')
3539         video_url = unescapeHTML(m.group('url'))
3540
3541         m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3542         if not m:
3543             self._downloader.trouble(u'Cannot find video title')
3544         title = unescapeHTML(m.group('title'))
3545
3546         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3547         if m:
3548             desc = unescapeHTML(m.group('desc'))
3549         else:
3550             desc = None
3551
3552         info = {
3553             'id': video_id,
3554             'url': video_url,
3555             'ext': 'mp4',
3556             'title': title,
3557             'description': desc,
3558         }
3559         return [info]
3560
3561 class TweetReelIE(InfoExtractor):
3562     _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3563
3564     def _real_extract(self, url):
3565         mobj = re.match(self._VALID_URL, url)
3566         if mobj is None:
3567             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3568             return
3569
3570         video_id = mobj.group('id')
3571         webpage = self._download_webpage(url, video_id)
3572
3573         m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3574         if not m:
3575             self._downloader.trouble(u'ERROR: Cannot find status ID')
3576         status_id = m.group(1)
3577
3578         m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3579         if not m:
3580             self._downloader.trouble(u'WARNING: Cannot find description')
3581         desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3582
3583         m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3584         if not m:
3585             self._downloader.trouble(u'ERROR: Cannot find uploader')
3586         uploader = unescapeHTML(m.group('uploader'))
3587         uploader_id = unescapeHTML(m.group('uploader_id'))
3588
3589         m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3590         if not m:
3591             self._downloader.trouble(u'ERROR: Cannot find upload date')
3592         upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3593
3594         title = desc
3595         video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3596
3597         info = {
3598             'id': video_id,
3599             'url': video_url,
3600             'ext': 'mov',
3601             'title': title,
3602             'description': desc,
3603             'uploader': uploader,
3604             'uploader_id': uploader_id,
3605             'internal_id': status_id,
3606             'upload_date': upload_date
3607         }
3608         return [info]
3609
3610 class SteamIE(InfoExtractor):
3611     _VALID_URL = r"""http://store.steampowered.com/
3612                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3613                 (?P<gameID>\d+)/?
3614                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3615                 """
3616
3617     def suitable(self, url):
3618         """Receives a URL and returns True if suitable for this IE."""
3619         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3620
3621     def _real_extract(self, url):
3622         m = re.match(self._VALID_URL, url, re.VERBOSE)
3623         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3624         gameID = m.group('gameID')
3625         videourl = 'http://store.steampowered.com/video/%s/' % gameID
3626         webpage = self._download_webpage(videourl, gameID)
3627         mweb = re.finditer(urlRE, webpage)
3628         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3629         titles = re.finditer(namesRE, webpage)
3630         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3631         thumbs = re.finditer(thumbsRE, webpage)
3632         videos = []
3633         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3634             video_id = vid.group('videoID')
3635             title = vtitle.group('videoName')
3636             video_url = vid.group('videoURL')
3637             video_thumb = thumb.group('thumbnail')
3638             if not video_url:
3639                 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3640             info = {
3641                 'id':video_id,
3642                 'url':video_url,
3643                 'ext': 'flv',
3644                 'title': unescapeHTML(title),
3645                 'thumbnail': video_thumb
3646                   }
3647             videos.append(info)
3648         return videos
3649
3650 class UstreamIE(InfoExtractor):
3651     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3652     IE_NAME = u'ustream'
3653
3654     def _real_extract(self, url):
3655         m = re.match(self._VALID_URL, url)
3656         video_id = m.group('videoID')
3657         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3658         webpage = self._download_webpage(url, video_id)
3659         m = re.search(r'data-title="(?P<title>.+)"',webpage)
3660         title = m.group('title')
3661         m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3662         uploader = m.group('uploader')
3663         info = {
3664                 'id':video_id,
3665                 'url':video_url,
3666                 'ext': 'flv',
3667                 'title': title,
3668                 'uploader': uploader
3669                   }
3670         return [info]
3671
3672 class RBMARadioIE(InfoExtractor):
3673     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3674
3675     def _real_extract(self, url):
3676         m = re.match(self._VALID_URL, url)
3677         video_id = m.group('videoID')
3678
3679         webpage = self._download_webpage(url, video_id)
3680         m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3681         if not m:
3682             raise ExtractorError(u'Cannot find metadata')
3683         json_data = m.group(1)
3684
3685         try:
3686             data = json.loads(json_data)
3687         except ValueError as e:
3688             raise ExtractorError(u'Invalid JSON: ' + str(e))
3689
3690         video_url = data['akamai_url'] + '&cbr=256'
3691         url_parts = compat_urllib_parse_urlparse(video_url)
3692         video_ext = url_parts.path.rpartition('.')[2]
3693         info = {
3694                 'id': video_id,
3695                 'url': video_url,
3696                 'ext': video_ext,
3697                 'title': data['title'],
3698                 'description': data.get('teaser_text'),
3699                 'location': data.get('country_of_origin'),
3700                 'uploader': data.get('host', {}).get('name'),
3701                 'uploader_id': data.get('host', {}).get('slug'),
3702                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3703                 'duration': data.get('duration'),
3704         }
3705         return [info]
3706
3707
3708 class YouPornIE(InfoExtractor):
3709     """Information extractor for youporn.com."""
3710     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3711
3712     def _print_formats(self, formats):
3713         """Print all available formats"""
3714         print(u'Available formats:')
3715         print(u'ext\t\tformat')
3716         print(u'---------------------------------')
3717         for format in formats:
3718             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3719
3720     def _specific(self, req_format, formats):
3721         for x in formats:
3722             if(x["format"]==req_format):
3723                 return x
3724         return None
3725
3726     def _real_extract(self, url):
3727         mobj = re.match(self._VALID_URL, url)
3728         if mobj is None:
3729             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3730             return
3731
3732         video_id = mobj.group('videoid')
3733
3734         req = compat_urllib_request.Request(url)
3735         req.add_header('Cookie', 'age_verified=1')
3736         webpage = self._download_webpage(req, video_id)
3737
3738         # Get the video title
3739         result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3740         if result is None:
3741             raise ExtractorError(u'Unable to extract video title')
3742         video_title = result.group('title').strip()
3743
3744         # Get the video date
3745         result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3746         if result is None:
3747             self._downloader.to_stderr(u'WARNING: unable to extract video date')
3748             upload_date = None
3749         else:
3750             upload_date = result.group('date').strip()
3751
3752         # Get the video uploader
3753         result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3754         if result is None:
3755             self._downloader.to_stderr(u'WARNING: unable to extract uploader')
3756             video_uploader = None
3757         else:
3758             video_uploader = result.group('uploader').strip()
3759             video_uploader = clean_html( video_uploader )
3760
3761         # Get all of the formats available
3762         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3763         result = re.search(DOWNLOAD_LIST_RE, webpage)
3764         if result is None:
3765             raise ExtractorError(u'Unable to extract download list')
3766         download_list_html = result.group('download_list').strip()
3767
3768         # Get all of the links from the page
3769         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3770         links = re.findall(LINK_RE, download_list_html)
3771         if(len(links) == 0):
3772             raise ExtractorError(u'ERROR: no known formats available for video')
3773
3774         self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3775
3776         formats = []
3777         for link in links:
3778
3779             # A link looks like this:
3780             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3781             # A path looks like this:
3782             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3783             video_url = unescapeHTML( link )
3784             path = compat_urllib_parse_urlparse( video_url ).path
3785             extension = os.path.splitext( path )[1][1:]
3786             format = path.split('/')[4].split('_')[:2]
3787             size = format[0]
3788             bitrate = format[1]
3789             format = "-".join( format )
3790             title = u'%s-%s-%s' % (video_title, size, bitrate)
3791
3792             formats.append({
3793                 'id': video_id,
3794                 'url': video_url,
3795                 'uploader': video_uploader,
3796                 'upload_date': upload_date,
3797                 'title': title,
3798                 'ext': extension,
3799                 'format': format,
3800                 'thumbnail': None,
3801                 'description': None,
3802                 'player_url': None
3803             })
3804
3805         if self._downloader.params.get('listformats', None):
3806             self._print_formats(formats)
3807             return
3808
3809         req_format = self._downloader.params.get('format', None)
3810         self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3811
3812         if req_format is None or req_format == 'best':
3813             return [formats[0]]
3814         elif req_format == 'worst':
3815             return [formats[-1]]
3816         elif req_format in ('-1', 'all'):
3817             return formats
3818         else:
3819             format = self._specific( req_format, formats )
3820             if result is None:
3821                 self._downloader.trouble(u'ERROR: requested format not available')
3822                 return
3823             return [format]
3824
3825
3826
3827 class PornotubeIE(InfoExtractor):
3828     """Information extractor for pornotube.com."""
3829     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3830
3831     def _real_extract(self, url):
3832         mobj = re.match(self._VALID_URL, url)
3833         if mobj is None:
3834             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3835             return
3836
3837         video_id = mobj.group('videoid')
3838         video_title = mobj.group('title')
3839
3840         # Get webpage content
3841         webpage = self._download_webpage(url, video_id)
3842
3843         # Get the video URL
3844         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3845         result = re.search(VIDEO_URL_RE, webpage)
3846         if result is None:
3847             self._downloader.trouble(u'ERROR: unable to extract video url')
3848             return
3849         video_url = compat_urllib_parse.unquote(result.group('url'))
3850
3851         #Get the uploaded date
3852         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3853         result = re.search(VIDEO_UPLOADED_RE, webpage)
3854         if result is None:
3855             self._downloader.trouble(u'ERROR: unable to extract video title')
3856             return
3857         upload_date = result.group('date')
3858
3859         info = {'id': video_id,
3860                 'url': video_url,
3861                 'uploader': None,
3862                 'upload_date': upload_date,
3863                 'title': video_title,
3864                 'ext': 'flv',
3865                 'format': 'flv'}
3866
3867         return [info]
3868
3869 class YouJizzIE(InfoExtractor):
3870     """Information extractor for youjizz.com."""
3871     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3872
3873     def _real_extract(self, url):
3874         mobj = re.match(self._VALID_URL, url)
3875         if mobj is None:
3876             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3877             return
3878
3879         video_id = mobj.group('videoid')
3880
3881         # Get webpage content
3882         webpage = self._download_webpage(url, video_id)
3883
3884         # Get the video title
3885         result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3886         if result is None:
3887             raise ExtractorError(u'ERROR: unable to extract video title')
3888         video_title = result.group('title').strip()
3889
3890         # Get the embed page
3891         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3892         if result is None:
3893             raise ExtractorError(u'ERROR: unable to extract embed page')
3894
3895         embed_page_url = result.group(0).strip()
3896         video_id = result.group('videoid')
3897
3898         webpage = self._download_webpage(embed_page_url, video_id)
3899
3900         # Get the video URL
3901         result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3902         if result is None:
3903             raise ExtractorError(u'ERROR: unable to extract video url')
3904         video_url = result.group('source')
3905
3906         info = {'id': video_id,
3907                 'url': video_url,
3908                 'title': video_title,
3909                 'ext': 'flv',
3910                 'format': 'flv',
3911                 'player_url': embed_page_url}
3912
3913         return [info]
3914
3915 class EightTracksIE(InfoExtractor):
3916     IE_NAME = '8tracks'
3917     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3918
3919     def _real_extract(self, url):
3920         mobj = re.match(self._VALID_URL, url)
3921         if mobj is None:
3922             raise ExtractorError(u'Invalid URL: %s' % url)
3923         playlist_id = mobj.group('id')
3924
3925         webpage = self._download_webpage(url, playlist_id)
3926
3927         m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3928         if not m:
3929             raise ExtractorError(u'Cannot find trax information')
3930         json_like = m.group(1)
3931         data = json.loads(json_like)
3932
3933         session = str(random.randint(0, 1000000000))
3934         mix_id = data['id']
3935         track_count = data['tracks_count']
3936         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3937         next_url = first_url
3938         res = []
3939         for i in itertools.count():
3940             api_json = self._download_webpage(next_url, playlist_id,
3941                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3942                 errnote=u'Failed to download song information')
3943             api_data = json.loads(api_json)
3944             track_data = api_data[u'set']['track']
3945             info = {
3946                 'id': track_data['id'],
3947                 'url': track_data['track_file_stream_url'],
3948                 'title': track_data['performer'] + u' - ' + track_data['name'],
3949                 'raw_title': track_data['name'],
3950                 'uploader_id': data['user']['login'],
3951                 'ext': 'm4a',
3952             }
3953             res.append(info)
3954             if api_data['set']['at_last_track']:
3955                 break
3956             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3957         return res
3958
3959 class KeekIE(InfoExtractor):
3960     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3961     IE_NAME = u'keek'
3962
3963     def _real_extract(self, url):
3964         m = re.match(self._VALID_URL, url)
3965         video_id = m.group('videoID')
3966         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3967         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3968         webpage = self._download_webpage(url, video_id)
3969         m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
3970         title = unescapeHTML(m.group('title'))
3971         m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
3972         uploader = unescapeHTML(m.group('uploader'))
3973         info = {
3974                 'id':video_id,
3975                 'url':video_url,
3976                 'ext': 'mp4',
3977                 'title': title,
3978                 'thumbnail': thumbnail,
3979                 'uploader': uploader
3980         }
3981         return [info]
3982
3983 class TEDIE(InfoExtractor):
3984     _VALID_URL=r'''http://www.ted.com/
3985                    (
3986                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3987                         |
3988                         ((?P<type_talk>talks)) # We have a simple talk
3989                    )
3990                    /(?P<name>\w+) # Here goes the name and then ".html"
3991                    '''
3992
3993     def suitable(self, url):
3994         """Receives a URL and returns True if suitable for this IE."""
3995         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3996
3997     def _real_extract(self, url):
3998         m=re.match(self._VALID_URL, url, re.VERBOSE)
3999         if m.group('type_talk'):
4000             return [self._talk_info(url)]
4001         else :
4002             playlist_id=m.group('playlist_id')
4003             name=m.group('name')
4004             self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4005             return self._playlist_videos_info(url,name,playlist_id)
4006
4007     def _talk_video_link(self,mediaSlug):
4008         '''Returns the video link for that mediaSlug'''
4009         return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4010
4011     def _playlist_videos_info(self,url,name,playlist_id=0):
4012         '''Returns the videos of the playlist'''
4013         video_RE=r'''
4014                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4015                      ([.\s]*?)data-playlist_item_id="(\d+)"
4016                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4017                      '''
4018         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4019         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4020         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4021         m_names=re.finditer(video_name_RE,webpage)
4022         info=[]
4023         for m_video, m_name in zip(m_videos,m_names):
4024             video_id=m_video.group('video_id')
4025             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4026             info.append(self._talk_info(talk_url,video_id))
4027         return info
4028
4029     def _talk_info(self, url, video_id=0):
4030         """Return the video for the talk in the url"""
4031         m=re.match(self._VALID_URL, url,re.VERBOSE)
4032         videoName=m.group('name')
4033         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4034         # If the url includes the language we get the title translated
4035         title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
4036         title=re.search(title_RE, webpage).group('title')
4037         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4038                         "id":(?P<videoID>[\d]+).*?
4039                         "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4040         thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4041         thumb_match=re.search(thumb_RE,webpage)
4042         info_match=re.search(info_RE,webpage,re.VERBOSE)
4043         video_id=info_match.group('videoID')
4044         mediaSlug=info_match.group('mediaSlug')
4045         video_url=self._talk_video_link(mediaSlug)
4046         info = {
4047                 'id': video_id,
4048                 'url': video_url,
4049                 'ext': 'mp4',
4050                 'title': title,
4051                 'thumbnail': thumb_match.group('thumbnail')
4052                 }
4053         return info
4054
4055 class MySpassIE(InfoExtractor):
4056     _VALID_URL = r'http://www.myspass.de/.*'
4057
4058     def _real_extract(self, url):
4059         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4060
4061         # video id is the last path element of the URL
4062         # usually there is a trailing slash, so also try the second but last
4063         url_path = compat_urllib_parse_urlparse(url).path
4064         url_parent_path, video_id = os.path.split(url_path)
4065         if not video_id:
4066             _, video_id = os.path.split(url_parent_path)
4067
4068         # get metadata
4069         metadata_url = META_DATA_URL_TEMPLATE % video_id
4070         metadata_text = self._download_webpage(metadata_url, video_id)
4071         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4072
4073         # extract values from metadata
4074         url_flv_el = metadata.find('url_flv')
4075         if url_flv_el is None:
4076             self._downloader.trouble(u'ERROR: unable to extract download url')
4077             return
4078         video_url = url_flv_el.text
4079         extension = os.path.splitext(video_url)[1][1:]
4080         title_el = metadata.find('title')
4081         if title_el is None:
4082             self._downloader.trouble(u'ERROR: unable to extract title')
4083             return
4084         title = title_el.text
4085         format_id_el = metadata.find('format_id')
4086         if format_id_el is None:
4087             format = ext
4088         else:
4089             format = format_id_el.text
4090         description_el = metadata.find('description')
4091         if description_el is not None:
4092             description = description_el.text
4093         else:
4094             description = None
4095         imagePreview_el = metadata.find('imagePreview')
4096         if imagePreview_el is not None:
4097             thumbnail = imagePreview_el.text
4098         else:
4099             thumbnail = None
4100         info = {
4101             'id': video_id,
4102             'url': video_url,
4103             'title': title,
4104             'ext': extension,
4105             'format': format,
4106             'thumbnail': thumbnail,
4107             'description': description
4108         }
4109         return [info]
4110
4111 def gen_extractors():
4112     """ Return a list of an instance of every supported extractor.
4113     The order does matter; the first extractor matched is the one handling the URL.
4114     """
4115     return [
4116         YoutubePlaylistIE(),
4117         YoutubeChannelIE(),
4118         YoutubeUserIE(),
4119         YoutubeSearchIE(),
4120         YoutubeIE(),
4121         MetacafeIE(),
4122         DailymotionIE(),
4123         GoogleSearchIE(),
4124         PhotobucketIE(),
4125         YahooIE(),
4126         YahooSearchIE(),
4127         DepositFilesIE(),
4128         FacebookIE(),
4129         BlipTVUserIE(),
4130         BlipTVIE(),
4131         VimeoIE(),
4132         MyVideoIE(),
4133         ComedyCentralIE(),
4134         EscapistIE(),
4135         CollegeHumorIE(),
4136         XVideosIE(),
4137         SoundcloudIE(),
4138         InfoQIE(),
4139         MixcloudIE(),
4140         StanfordOpenClassroomIE(),
4141         MTVIE(),
4142         YoukuIE(),
4143         XNXXIE(),
4144         YouJizzIE(),
4145         PornotubeIE(),
4146         YouPornIE(),
4147         GooglePlusIE(),
4148         ArteTvIE(),
4149         NBAIE(),
4150         JustinTVIE(),
4151         FunnyOrDieIE(),
4152         TweetReelIE(),
4153         SteamIE(),
4154         UstreamIE(),
4155         RBMARadioIE(),
4156         EightTracksIE(),
4157         KeekIE(),
4158         TEDIE(),
4159         MySpassIE(),
4160         GenericIE()
4161     ]
4162
4163