youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import datetime
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import time
  12 import email.utils
  13 import xml.etree.ElementTree
  14 import random
  15 import math
  16
  17 from .utils import *
  18
  19
  20 class InfoExtractor(object):
  21     """Information Extractor class.
  22
  23     Information extractors are the classes that, given a URL, extract
  24     information about the video (or videos) the URL refers to. This
  25     information includes the real video URL, the video title, author and
  26     others. The information is stored in a dictionary which is then
  27     passed to the FileDownloader. The FileDownloader processes this
  28     information possibly downloading the video to the file system, among
  29     other possible outcomes.
  30
  31     The dictionaries must include the following fields:
  32
  33     id:             Video identifier.
  34     url:            Final video URL.
  35     uploader:       Nickname of the video uploader, unescaped.
  36     upload_date:    Video upload date (YYYYMMDD).
  37     title:          Video title, unescaped.
  38     ext:            Video filename extension.
  39
  40     The following fields are optional:
  41
  42     format:         The video format, defaults to ext (used for --get-format)
  43     thumbnail:      Full URL to a video thumbnail image.
  44     description:    One-line video description.
  45     player_url:     SWF Player URL (used for rtmpdump).
  46     subtitles:      The .srt file contents.
  47     urlhandle:      [internal] The urlHandle to be used to download the file,
  48                     like returned by urllib.request.urlopen
  49
  50     The fields should all be Unicode strings.
  51
  52     Subclasses of this one should re-define the _real_initialize() and
  53     _real_extract() methods and define a _VALID_URL regexp.
  54     Probably, they should also be added to the list of extractors.
  55
  56     _real_extract() must return a *list* of information dictionaries as
  57     described above.
  58
  59     Finally, the _WORKING attribute should be set to False for broken IEs
  60     in order to warn the users and skip the tests.
  61     """
  62
  63     _ready = False
  64     _downloader = None
  65     _WORKING = True
  66
  67     def __init__(self, downloader=None):
  68         """Constructor. Receives an optional downloader."""
  69         self._ready = False
  70         self.set_downloader(downloader)
  71
  72     def suitable(self, url):
  73         """Receives a URL and returns True if suitable for this IE."""
  74         return re.match(self._VALID_URL, url) is not None
  75
  76     def working(self):
  77         """Getter method for _WORKING."""
  78         return self._WORKING
  79
  80     def initialize(self):
  81         """Initializes an instance (authentication, etc)."""
  82         if not self._ready:
  83             self._real_initialize()
  84             self._ready = True
  85
  86     def extract(self, url):
  87         """Extracts URL information and returns it in list of dicts."""
  88         self.initialize()
  89         return self._real_extract(url)
  90
  91     def set_downloader(self, downloader):
  92         """Sets the downloader for this IE."""
  93         self._downloader = downloader
  94
  95     def _real_initialize(self):
  96         """Real initialization process. Redefine in subclasses."""
  97         pass
  98
  99     def _real_extract(self, url):
 100         """Real extraction process. Redefine in subclasses."""
 101         pass
 102
 103
 104 class YoutubeIE(InfoExtractor):
 105     """Information extractor for youtube.com."""
 106
 107     _VALID_URL = r"""^
 108                      (
 109                          (?:https?://)?                                       # http(s):// (optional)
 110                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 111                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 112                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 113                          (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 114                          (?:                                                  # the various things that can precede the ID:
 115                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 116                              |(?:                                             # or the v= param in all its forms
 117                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 118                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 119                                  (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
 120                                  v=
 121                              )
 122                          )?                                                   # optional -> youtube.com/xxxx is OK
 123                      )?                                                       # all until now is optional -> you can pass the naked ID
 124                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 125                      (?(1).+)?                                                # if we found the ID, everything can follow
 126                      $"""
 127     _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 128     _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 129     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 130     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 131     _NETRC_MACHINE = 'youtube'
 132     # Listed in order of quality
 133     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 134     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 135     _video_extensions = {
 136         '13': '3gp',
 137         '17': 'mp4',
 138         '18': 'mp4',
 139         '22': 'mp4',
 140         '37': 'mp4',
 141         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 142         '43': 'webm',
 143         '44': 'webm',
 144         '45': 'webm',
 145         '46': 'webm',
 146     }
 147     _video_dimensions = {
 148         '5': '240x400',
 149         '6': '???',
 150         '13': '???',
 151         '17': '144x176',
 152         '18': '360x640',
 153         '22': '720x1280',
 154         '34': '360x640',
 155         '35': '480x854',
 156         '37': '1080x1920',
 157         '38': '3072x4096',
 158         '43': '360x640',
 159         '44': '480x854',
 160         '45': '720x1280',
 161         '46': '1080x1920',
 162     }
 163     IE_NAME = u'youtube'
 164
 165     def suitable(self, url):
 166         """Receives a URL and returns True if suitable for this IE."""
 167         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 168
 169     def report_lang(self):
 170         """Report attempt to set language."""
 171         self._downloader.to_screen(u'[youtube] Setting language')
 172
 173     def report_login(self):
 174         """Report attempt to log in."""
 175         self._downloader.to_screen(u'[youtube] Logging in')
 176
 177     def report_age_confirmation(self):
 178         """Report attempt to confirm age."""
 179         self._downloader.to_screen(u'[youtube] Confirming age')
 180
 181     def report_video_webpage_download(self, video_id):
 182         """Report attempt to download video webpage."""
 183         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 184
 185     def report_video_info_webpage_download(self, video_id):
 186         """Report attempt to download video info webpage."""
 187         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 188
 189     def report_video_subtitles_download(self, video_id):
 190         """Report attempt to download video info webpage."""
 191         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 192
 193     def report_information_extraction(self, video_id):
 194         """Report attempt to extract video information."""
 195         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 196
 197     def report_unavailable_format(self, video_id, format):
 198         """Report extracted video URL."""
 199         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 200
 201     def report_rtmp_download(self):
 202         """Indicate the download will use the RTMP protocol."""
 203         self._downloader.to_screen(u'[youtube] RTMP download detected')
 204
 205     def _closed_captions_xml_to_srt(self, xml_string):
 206         srt = ''
 207         texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 208         # TODO parse xml instead of regex
 209         for n, (start, dur_tag, dur, caption) in enumerate(texts):
 210             if not dur: dur = '4'
 211             start = float(start)
 212             end = start + float(dur)
 213             start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 214             end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 215             caption = unescapeHTML(caption)
 216             caption = unescapeHTML(caption) # double cycle, intentional
 217             srt += str(n+1) + '\n'
 218             srt += start + ' --> ' + end + '\n'
 219             srt += caption + '\n\n'
 220         return srt
 221
 222     def _print_formats(self, formats):
 223         print('Available formats:')
 224         for x in formats:
 225             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 226
 227     def _real_initialize(self):
 228         if self._downloader is None:
 229             return
 230
 231         username = None
 232         password = None
 233         downloader_params = self._downloader.params
 234
 235         # Attempt to use provided username and password or .netrc data
 236         if downloader_params.get('username', None) is not None:
 237             username = downloader_params['username']
 238             password = downloader_params['password']
 239         elif downloader_params.get('usenetrc', False):
 240             try:
 241                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 242                 if info is not None:
 243                     username = info[0]
 244                     password = info[2]
 245                 else:
 246                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 247             except (IOError, netrc.NetrcParseError) as err:
 248                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
 249                 return
 250
 251         # Set language
 252         request = compat_urllib_request.Request(self._LANG_URL)
 253         try:
 254             self.report_lang()
 255             compat_urllib_request.urlopen(request).read()
 256         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 257             self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
 258             return
 259
 260         # No authentication to be performed
 261         if username is None:
 262             return
 263
 264         # Log in
 265         login_form = {
 266                 'current_form': 'loginForm',
 267                 'next':     '/',
 268                 'action_login': 'Log In',
 269                 'username': username,
 270                 'password': password,
 271                 }
 272         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
 273         try:
 274             self.report_login()
 275             login_results = compat_urllib_request.urlopen(request).read()
 276             if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 277                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 278                 return
 279         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 280             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
 281             return
 282
 283         # Confirm age
 284         age_form = {
 285                 'next_url':     '/',
 286                 'action_confirm':   'Confirm',
 287                 }
 288         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 289         try:
 290             self.report_age_confirmation()
 291             age_results = compat_urllib_request.urlopen(request).read()
 292         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 293             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 294             return
 295
 296     def _real_extract(self, url):
 297         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 298         mobj = re.search(self._NEXT_URL_RE, url)
 299         if mobj:
 300             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 301
 302         # Extract video id from URL
 303         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 304         if mobj is None:
 305             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 306             return
 307         video_id = mobj.group(2)
 308
 309         # Get video webpage
 310         self.report_video_webpage_download(video_id)
 311         request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
 312         try:
 313             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 314         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 315             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
 316             return
 317
 318         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 319
 320         # Attempt to extract SWF player URL
 321         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 322         if mobj is not None:
 323             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 324         else:
 325             player_url = None
 326
 327         # Get video info
 328         self.report_video_info_webpage_download(video_id)
 329         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 330             video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 331                     % (video_id, el_type))
 332             request = compat_urllib_request.Request(video_info_url)
 333             try:
 334                 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
 335                 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
 336                 video_info = compat_parse_qs(video_info_webpage)
 337                 if 'token' in video_info:
 338                     break
 339             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 340                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
 341                 return
 342         if 'token' not in video_info:
 343             if 'reason' in video_info:
 344                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
 345             else:
 346                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 347             return
 348
 349         # Check for "rental" videos
 350         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 351             self._downloader.trouble(u'ERROR: "rental" videos not supported')
 352             return
 353
 354         # Start extracting information
 355         self.report_information_extraction(video_id)
 356
 357         # uploader
 358         if 'author' not in video_info:
 359             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 360             return
 361         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 362
 363         # title
 364         if 'title' not in video_info:
 365             self._downloader.trouble(u'ERROR: unable to extract video title')
 366             return
 367         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 368
 369         # thumbnail image
 370         if 'thumbnail_url' not in video_info:
 371             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 372             video_thumbnail = ''
 373         else:   # don't panic if we can't find it
 374             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 375
 376         # upload date
 377         upload_date = None
 378         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 379         if mobj is not None:
 380             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 381             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 382             for expression in format_expressions:
 383                 try:
 384                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 385                 except:
 386                     pass
 387
 388         # description
 389         video_description = get_element_by_id("eow-description", video_webpage)
 390         if video_description:
 391             video_description = clean_html(video_description)
 392         else:
 393             video_description = ''
 394
 395         # closed captions
 396         video_subtitles = None
 397         if self._downloader.params.get('writesubtitles', False):
 398             try:
 399                 self.report_video_subtitles_download(video_id)
 400                 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 401                 try:
 402                     srt_list = compat_urllib_request.urlopen(request).read()
 403                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 404                     raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
 405                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 406                 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 407                 if not srt_lang_list:
 408                     raise Trouble(u'WARNING: video has no closed captions')
 409                 if self._downloader.params.get('subtitleslang', False):
 410                     srt_lang = self._downloader.params.get('subtitleslang')
 411                 elif 'en' in srt_lang_list:
 412                     srt_lang = 'en'
 413                 else:
 414                     srt_lang = srt_lang_list.keys()[0]
 415                 if not srt_lang in srt_lang_list:
 416                     raise Trouble(u'WARNING: no closed captions found in the specified language')
 417                 request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
 418                 try:
 419                     srt_xml = compat_urllib_request.urlopen(request).read()
 420                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 421                     raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
 422                 if not srt_xml:
 423                     raise Trouble(u'WARNING: unable to download video subtitles')
 424                 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
 425             except Trouble as trouble:
 426                 self._downloader.trouble(trouble[0])
 427
 428         if 'length_seconds' not in video_info:
 429             self._downloader.trouble(u'WARNING: unable to extract video duration')
 430             video_duration = ''
 431         else:
 432             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 433
 434         # token
 435         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 436
 437         # Decide which formats to download
 438         req_format = self._downloader.params.get('format', None)
 439
 440         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 441             self.report_rtmp_download()
 442             video_url_list = [(None, video_info['conn'][0])]
 443         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 444             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 445             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 446             url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
 447             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 448
 449             format_limit = self._downloader.params.get('format_limit', None)
 450             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 451             if format_limit is not None and format_limit in available_formats:
 452                 format_list = available_formats[available_formats.index(format_limit):]
 453             else:
 454                 format_list = available_formats
 455             existing_formats = [x for x in format_list if x in url_map]
 456             if len(existing_formats) == 0:
 457                 self._downloader.trouble(u'ERROR: no known formats available for video')
 458                 return
 459             if self._downloader.params.get('listformats', None):
 460                 self._print_formats(existing_formats)
 461                 return
 462             if req_format is None or req_format == 'best':
 463                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 464             elif req_format == 'worst':
 465                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 466             elif req_format in ('-1', 'all'):
 467                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 468             else:
 469                 # Specific formats. We pick the first in a slash-delimeted sequence.
 470                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 471                 req_formats = req_format.split('/')
 472                 video_url_list = None
 473                 for rf in req_formats:
 474                     if rf in url_map:
 475                         video_url_list = [(rf, url_map[rf])]
 476                         break
 477                 if video_url_list is None:
 478                     self._downloader.trouble(u'ERROR: requested format not available')
 479                     return
 480         else:
 481             self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 482             return
 483
 484         results = []
 485         for format_param, video_real_url in video_url_list:
 486             # Extension
 487             video_extension = self._video_extensions.get(format_param, 'flv')
 488
 489             video_format = '{} - {}'.format(format_param if format_param else video_extension,
 490                                             self._video_dimensions.get(format_param, '???'))
 491
 492             results.append({
 493                 'id':       video_id,
 494                 'url':      video_real_url,
 495                 'uploader': video_uploader,
 496                 'upload_date':  upload_date,
 497                 'title':    video_title,
 498                 'ext':      video_extension,
 499                 'format':   video_format,
 500                 'thumbnail':    video_thumbnail,
 501                 'description':  video_description,
 502                 'player_url':   player_url,
 503                 'subtitles':    video_subtitles,
 504                 'duration':     video_duration
 505             })
 506         return results
 507
 508
 509 class MetacafeIE(InfoExtractor):
 510     """Information Extractor for metacafe.com."""
 511
 512     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 513     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 514     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 515     IE_NAME = u'metacafe'
 516
 517     def __init__(self, downloader=None):
 518         InfoExtractor.__init__(self, downloader)
 519
 520     def report_disclaimer(self):
 521         """Report disclaimer retrieval."""
 522         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 523
 524     def report_age_confirmation(self):
 525         """Report attempt to confirm age."""
 526         self._downloader.to_screen(u'[metacafe] Confirming age')
 527
 528     def report_download_webpage(self, video_id):
 529         """Report webpage download."""
 530         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 531
 532     def report_extraction(self, video_id):
 533         """Report information extraction."""
 534         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 535
 536     def _real_initialize(self):
 537         # Retrieve disclaimer
 538         request = compat_urllib_request.Request(self._DISCLAIMER)
 539         try:
 540             self.report_disclaimer()
 541             disclaimer = compat_urllib_request.urlopen(request).read()
 542         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 543             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
 544             return
 545
 546         # Confirm age
 547         disclaimer_form = {
 548             'filters': '0',
 549             'submit': "Continue - I'm over 18",
 550             }
 551         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 552         try:
 553             self.report_age_confirmation()
 554             disclaimer = compat_urllib_request.urlopen(request).read()
 555         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 556             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 557             return
 558
 559     def _real_extract(self, url):
 560         # Extract id and simplified title from URL
 561         mobj = re.match(self._VALID_URL, url)
 562         if mobj is None:
 563             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 564             return
 565
 566         video_id = mobj.group(1)
 567
 568         # Check if video comes from YouTube
 569         mobj2 = re.match(r'^yt-(.*)$', video_id)
 570         if mobj2 is not None:
 571             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 572             return
 573
 574         # Retrieve video webpage to extract further information
 575         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 576         try:
 577             self.report_download_webpage(video_id)
 578             webpage = compat_urllib_request.urlopen(request).read()
 579         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 580             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 581             return
 582
 583         # Extract URL, uploader and title from webpage
 584         self.report_extraction(video_id)
 585         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 586         if mobj is not None:
 587             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 588             video_extension = mediaURL[-3:]
 589
 590             # Extract gdaKey if available
 591             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 592             if mobj is None:
 593                 video_url = mediaURL
 594             else:
 595                 gdaKey = mobj.group(1)
 596                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 597         else:
 598             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 599             if mobj is None:
 600                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 601                 return
 602             vardict = compat_parse_qs(mobj.group(1))
 603             if 'mediaData' not in vardict:
 604                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 605                 return
 606             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 607             if mobj is None:
 608                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 609                 return
 610             mediaURL = mobj.group(1).replace('\\/', '/')
 611             video_extension = mediaURL[-3:]
 612             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 613
 614         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 615         if mobj is None:
 616             self._downloader.trouble(u'ERROR: unable to extract title')
 617             return
 618         video_title = mobj.group(1).decode('utf-8')
 619
 620         mobj = re.search(r'submitter=(.*?);', webpage)
 621         if mobj is None:
 622             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 623             return
 624         video_uploader = mobj.group(1)
 625
 626         return [{
 627             'id':       video_id.decode('utf-8'),
 628             'url':      video_url.decode('utf-8'),
 629             'uploader': video_uploader.decode('utf-8'),
 630             'upload_date':  None,
 631             'title':    video_title,
 632             'ext':      video_extension.decode('utf-8'),
 633         }]
 634
 635
 636 class DailymotionIE(InfoExtractor):
 637     """Information Extractor for Dailymotion"""
 638
 639     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 640     IE_NAME = u'dailymotion'
 641
 642     def __init__(self, downloader=None):
 643         InfoExtractor.__init__(self, downloader)
 644
 645     def report_download_webpage(self, video_id):
 646         """Report webpage download."""
 647         self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
 648
 649     def report_extraction(self, video_id):
 650         """Report information extraction."""
 651         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 652
 653     def _real_extract(self, url):
 654         # Extract id and simplified title from URL
 655         mobj = re.match(self._VALID_URL, url)
 656         if mobj is None:
 657             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 658             return
 659
 660         video_id = mobj.group(1).split('_')[0].split('?')[0]
 661
 662         video_extension = 'mp4'
 663
 664         # Retrieve video webpage to extract further information
 665         request = compat_urllib_request.Request(url)
 666         request.add_header('Cookie', 'family_filter=off')
 667         try:
 668             self.report_download_webpage(video_id)
 669             webpage = compat_urllib_request.urlopen(request).read()
 670         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 671             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 672             return
 673
 674         # Extract URL, uploader and title from webpage
 675         self.report_extraction(video_id)
 676         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 677         if mobj is None:
 678             self._downloader.trouble(u'ERROR: unable to extract media URL')
 679             return
 680         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 681
 682         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 683             if key in flashvars:
 684                 max_quality = key
 685                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 686                 break
 687         else:
 688             self._downloader.trouble(u'ERROR: unable to extract video URL')
 689             return
 690
 691         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 692         if mobj is None:
 693             self._downloader.trouble(u'ERROR: unable to extract video URL')
 694             return
 695
 696         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 697
 698         # TODO: support choosing qualities
 699
 700         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 701         if mobj is None:
 702             self._downloader.trouble(u'ERROR: unable to extract title')
 703             return
 704         video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
 705
 706         video_uploader = None
 707         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 708         if mobj is None:
 709             # lookin for official user
 710             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 711             if mobj_official is None:
 712                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 713             else:
 714                 video_uploader = mobj_official.group(1)
 715         else:
 716             video_uploader = mobj.group(1)
 717
 718         video_upload_date = None
 719         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 720         if mobj is not None:
 721             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 722
 723         return [{
 724             'id':       video_id.decode('utf-8'),
 725             'url':      video_url.decode('utf-8'),
 726             'uploader': video_uploader.decode('utf-8'),
 727             'upload_date':  video_upload_date,
 728             'title':    video_title,
 729             'ext':      video_extension.decode('utf-8'),
 730         }]
 731
 732
 733 class GoogleIE(InfoExtractor):
 734     """Information extractor for video.google.com."""
 735
 736     _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
 737     IE_NAME = u'video.google'
 738
 739     def __init__(self, downloader=None):
 740         InfoExtractor.__init__(self, downloader)
 741
 742     def report_download_webpage(self, video_id):
 743         """Report webpage download."""
 744         self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
 745
 746     def report_extraction(self, video_id):
 747         """Report information extraction."""
 748         self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
 749
 750     def _real_extract(self, url):
 751         # Extract id from URL
 752         mobj = re.match(self._VALID_URL, url)
 753         if mobj is None:
 754             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 755             return
 756
 757         video_id = mobj.group(1)
 758
 759         video_extension = 'mp4'
 760
 761         # Retrieve video webpage to extract further information
 762         request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
 763         try:
 764             self.report_download_webpage(video_id)
 765             webpage = compat_urllib_request.urlopen(request).read()
 766         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 767             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 768             return
 769
 770         # Extract URL, uploader, and title from webpage
 771         self.report_extraction(video_id)
 772         mobj = re.search(r"download_url:'([^']+)'", webpage)
 773         if mobj is None:
 774             video_extension = 'flv'
 775             mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
 776         if mobj is None:
 777             self._downloader.trouble(u'ERROR: unable to extract media URL')
 778             return
 779         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 780         mediaURL = mediaURL.replace('\\x3d', '\x3d')
 781         mediaURL = mediaURL.replace('\\x26', '\x26')
 782
 783         video_url = mediaURL
 784
 785         mobj = re.search(r'<title>(.*)</title>', webpage)
 786         if mobj is None:
 787             self._downloader.trouble(u'ERROR: unable to extract title')
 788             return
 789         video_title = mobj.group(1).decode('utf-8')
 790
 791         # Extract video description
 792         mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
 793         if mobj is None:
 794             self._downloader.trouble(u'ERROR: unable to extract video description')
 795             return
 796         video_description = mobj.group(1).decode('utf-8')
 797         if not video_description:
 798             video_description = 'No description available.'
 799
 800         # Extract video thumbnail
 801         if self._downloader.params.get('forcethumbnail', False):
 802             request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
 803             try:
 804                 webpage = compat_urllib_request.urlopen(request).read()
 805             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 806                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 807                 return
 808             mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
 809             if mobj is None:
 810                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 811                 return
 812             video_thumbnail = mobj.group(1)
 813         else:   # we need something to pass to process_info
 814             video_thumbnail = ''
 815
 816         return [{
 817             'id':       video_id.decode('utf-8'),
 818             'url':      video_url.decode('utf-8'),
 819             'uploader': None,
 820             'upload_date':  None,
 821             'title':    video_title,
 822             'ext':      video_extension.decode('utf-8'),
 823         }]
 824
 825
 826 class PhotobucketIE(InfoExtractor):
 827     """Information extractor for photobucket.com."""
 828
 829     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 830     IE_NAME = u'photobucket'
 831
 832     def __init__(self, downloader=None):
 833         InfoExtractor.__init__(self, downloader)
 834
 835     def report_download_webpage(self, video_id):
 836         """Report webpage download."""
 837         self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 838
 839     def report_extraction(self, video_id):
 840         """Report information extraction."""
 841         self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 842
 843     def _real_extract(self, url):
 844         # Extract id from URL
 845         mobj = re.match(self._VALID_URL, url)
 846         if mobj is None:
 847             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 848             return
 849
 850         video_id = mobj.group(1)
 851
 852         video_extension = 'flv'
 853
 854         # Retrieve video webpage to extract further information
 855         request = compat_urllib_request.Request(url)
 856         try:
 857             self.report_download_webpage(video_id)
 858             webpage = compat_urllib_request.urlopen(request).read()
 859         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 860             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 861             return
 862
 863         # Extract URL, uploader, and title from webpage
 864         self.report_extraction(video_id)
 865         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 866         if mobj is None:
 867             self._downloader.trouble(u'ERROR: unable to extract media URL')
 868             return
 869         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 870
 871         video_url = mediaURL
 872
 873         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 874         if mobj is None:
 875             self._downloader.trouble(u'ERROR: unable to extract title')
 876             return
 877         video_title = mobj.group(1).decode('utf-8')
 878
 879         video_uploader = mobj.group(2).decode('utf-8')
 880
 881         return [{
 882             'id':       video_id.decode('utf-8'),
 883             'url':      video_url.decode('utf-8'),
 884             'uploader': video_uploader,
 885             'upload_date':  None,
 886             'title':    video_title,
 887             'ext':      video_extension.decode('utf-8'),
 888         }]
 889
 890
 891 class YahooIE(InfoExtractor):
 892     """Information extractor for video.yahoo.com."""
 893
 894     # _VALID_URL matches all Yahoo! Video URLs
 895     # _VPAGE_URL matches only the extractable '/watch/' URLs
 896     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 897     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 898     IE_NAME = u'video.yahoo'
 899
 900     def __init__(self, downloader=None):
 901         InfoExtractor.__init__(self, downloader)
 902
 903     def report_download_webpage(self, video_id):
 904         """Report webpage download."""
 905         self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 906
 907     def report_extraction(self, video_id):
 908         """Report information extraction."""
 909         self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 910
 911     def _real_extract(self, url, new_video=True):
 912         # Extract ID from URL
 913         mobj = re.match(self._VALID_URL, url)
 914         if mobj is None:
 915             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 916             return
 917
 918         video_id = mobj.group(2)
 919         video_extension = 'flv'
 920
 921         # Rewrite valid but non-extractable URLs as
 922         # extractable English language /watch/ URLs
 923         if re.match(self._VPAGE_URL, url) is None:
 924             request = compat_urllib_request.Request(url)
 925             try:
 926                 webpage = compat_urllib_request.urlopen(request).read()
 927             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 928                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 929                 return
 930
 931             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 932             if mobj is None:
 933                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 934                 return
 935             yahoo_id = mobj.group(1)
 936
 937             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 938             if mobj is None:
 939                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 940                 return
 941             yahoo_vid = mobj.group(1)
 942
 943             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 944             return self._real_extract(url, new_video=False)
 945
 946         # Retrieve video webpage to extract further information
 947         request = compat_urllib_request.Request(url)
 948         try:
 949             self.report_download_webpage(video_id)
 950             webpage = compat_urllib_request.urlopen(request).read()
 951         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 952             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 953             return
 954
 955         # Extract uploader and title from webpage
 956         self.report_extraction(video_id)
 957         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 958         if mobj is None:
 959             self._downloader.trouble(u'ERROR: unable to extract video title')
 960             return
 961         video_title = mobj.group(1).decode('utf-8')
 962
 963         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 964         if mobj is None:
 965             self._downloader.trouble(u'ERROR: unable to extract video uploader')
 966             return
 967         video_uploader = mobj.group(1).decode('utf-8')
 968
 969         # Extract video thumbnail
 970         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 971         if mobj is None:
 972             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 973             return
 974         video_thumbnail = mobj.group(1).decode('utf-8')
 975
 976         # Extract video description
 977         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 978         if mobj is None:
 979             self._downloader.trouble(u'ERROR: unable to extract video description')
 980             return
 981         video_description = mobj.group(1).decode('utf-8')
 982         if not video_description:
 983             video_description = 'No description available.'
 984
 985         # Extract video height and width
 986         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 987         if mobj is None:
 988             self._downloader.trouble(u'ERROR: unable to extract video height')
 989             return
 990         yv_video_height = mobj.group(1)
 991
 992         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 993         if mobj is None:
 994             self._downloader.trouble(u'ERROR: unable to extract video width')
 995             return
 996         yv_video_width = mobj.group(1)
 997
 998         # Retrieve video playlist to extract media URL
 999         # I'm not completely sure what all these options are, but we
1000         # seem to need most of them, otherwise the server sends a 401.
1001         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1002         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1003         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1004                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1005                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1006         try:
1007             self.report_download_webpage(video_id)
1008             webpage = compat_urllib_request.urlopen(request).read()
1009         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1010             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1011             return
1012
1013         # Extract media URL from playlist XML
1014         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1015         if mobj is None:
1016             self._downloader.trouble(u'ERROR: Unable to extract media URL')
1017             return
1018         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1019         video_url = unescapeHTML(video_url)
1020
1021         return [{
1022             'id':       video_id.decode('utf-8'),
1023             'url':      video_url,
1024             'uploader': video_uploader,
1025             'upload_date':  None,
1026             'title':    video_title,
1027             'ext':      video_extension.decode('utf-8'),
1028             'thumbnail':    video_thumbnail.decode('utf-8'),
1029             'description':  video_description,
1030         }]
1031
1032
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs (the dot after www/player is escaped so
    # that only a literal '.' is accepted there)
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the video information for a Vimeo URL.

        new_video is accepted but not read by this method.

        Returns a list with a single info dictionary, or None once a
        problem has been passed to self._downloader.trouble().
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON. The slicing is part of the guarded code:
        # a page without the ' = {config:' marker raises IndexError, which
        # must be reported like malformed JSON instead of escaping.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        video_files = config["video"]["files"]
        for codec_name, codec_extension in codecs:
            if codec_name in video_files:
                if 'hd' in video_files[codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in video_files[codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, video_files[codec_name][0]))

        # Take the first codec of the best quality bucket that is not empty
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1144
1145
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its content.

        Returns None once a failure has been passed to
        self._downloader.trouble().
        """
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Download url, search it with regex and collect the wanted groups.

        matchTuples is a sequence of (group index, key, error message)
        entries. For each entry the group is stored under key in the
        returned dictionary; if the group did not take part in the match,
        the error message is passed to self._downloader.trouble().

        Returns the dictionary, or None when the download failed, the
        regex did not match or a requested group is missing.
        """
        page = self.fetch_webpage(url)
        if page is None:
            # The failure was already reported by fetch_webpage; searching
            # None would only raise a TypeError.
            return

        mobj = re.search(regex, page, regexFlags)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        info = {}
        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the rtmp URL of a live stream page.

        NOTE: the URL is only computed; nothing is returned, so callers
        get None for live streams.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r".*?)'.*?" +
                r'(http://.*?\.swf).*?' +
                r"(rtmp://.*?)'",
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of reference files of a video page.

        Returns the info dictionary of the video, or None when one of the
        intermediate lookups failed (the failure is reported by
        grep_webpage).
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        if info is None:
            return
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Extract the video information for an arte.tv URL.

        Returns a list with a single info dictionary for regular video
        pages. Returns None for live stream pages (last path component
        matching _LIVE_URL) and when the extraction failed.
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return

        info = self.extractPlus7Stream(url)
        if info is None:
            # Do not hand [None] to the caller after a failed extraction
            return

        return [info]
1281
1282
1283 class GenericIE(InfoExtractor):
1284     """Generic last-resort information extractor."""
1285
1286     _VALID_URL = r'.*'
1287     IE_NAME = u'generic'
1288
1289     def __init__(self, downloader=None):
1290         InfoExtractor.__init__(self, downloader)
1291
1292     def report_download_webpage(self, video_id):
1293         """Report webpage download."""
1294         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1295         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1296
1297     def report_extraction(self, video_id):
1298         """Report information extraction."""
1299         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1300
1301     def report_following_redirect(self, new_url):
1302         """Report information extraction."""
1303         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1304
1305     def _test_redirect(self, url):
1306         """Check if it is a redirect, like url shorteners, in case restart chain."""
1307         class HeadRequest(compat_urllib_request.Request):
1308             def get_method(self):
1309                 return "HEAD"
1310
1311         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1312             """
1313             Subclass the HTTPRedirectHandler to make it use our
1314             HeadRequest also on the redirected URL
1315             """
1316             def redirect_request(self, req, fp, code, msg, headers, newurl):
1317                 if code in (301, 302, 303, 307):
1318                     newurl = newurl.replace(' ', '%20')
1319                     newheaders = dict((k,v) for k,v in req.headers.items()
1320                                       if k.lower() not in ("content-length", "content-type"))
1321                     return HeadRequest(newurl,
1322                                        headers=newheaders,
1323                                        origin_req_host=req.get_origin_req_host(),
1324                                        unverifiable=True)
1325                 else:
1326                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1327
1328         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1329             """
1330             Fallback to GET if HEAD is not allowed (405 HTTP error)
1331             """
1332             def http_error_405(self, req, fp, code, msg, headers):
1333                 fp.read()
1334                 fp.close()
1335
1336                 newheaders = dict((k,v) for k,v in req.headers.items()
1337                                   if k.lower() not in ("content-length", "content-type"))
1338                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1339                                                  headers=newheaders,
1340                                                  origin_req_host=req.get_origin_req_host(),
1341                                                  unverifiable=True))
1342
1343         # Build our opener
1344         opener = compat_urllib_request.OpenerDirector()
1345         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1346                         HTTPMethodFallback, HEADRedirectHandler,
1347                         compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1348             opener.add_handler(handler())
1349
1350         response = opener.open(HeadRequest(url))
1351         new_url = response.geturl()
1352
1353         if url == new_url:
1354             return False
1355
1356         self.report_following_redirect(new_url)
1357         self._downloader.download([new_url])
1358         return True
1359
1360     def _real_extract(self, url):
1361         if self._test_redirect(url): return
1362
1363         video_id = url.split('/')[-1]
1364         request = compat_urllib_request.Request(url)
1365         try:
1366             self.report_download_webpage(video_id)
1367             webpage = compat_urllib_request.urlopen(request).read()
1368         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1369             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1370             return
1371         except ValueError as err:
1372             # since this is the last-resort InfoExtractor, if
1373             # this error is thrown, it'll be thrown here
1374             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1375             return
1376
1377         self.report_extraction(video_id)
1378         # Start with something easy: JW Player in SWFObject
1379         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1380         if mobj is None:
1381             # Broaden the search a little bit
1382             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1383         if mobj is None:
1384             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1385             return
1386
1387         # It's possible that one of the regexes
1388         # matched, but returned an empty group:
1389         if mobj.group(1) is None:
1390             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1391             return
1392
1393         video_url = compat_urllib_parse.unquote(mobj.group(1))
1394         video_id = os.path.basename(video_url)
1395
1396         # here's a fun little line of code for you:
1397         video_extension = os.path.splitext(video_id)[1][1:]
1398         video_id = os.path.splitext(video_id)[0]
1399
1400         # it's tempting to parse this further, but you would
1401         # have to take into account all the variations like
1402         #   Video Title - Site Name
1403         #   Site Name | Video Title
1404         #   Video Title - Tagline | Site Name
1405         # and so on and so forth; it's just not practical
1406         mobj = re.search(r'<title>(.*)</title>', webpage)
1407         if mobj is None:
1408             self._downloader.trouble(u'ERROR: unable to extract title')
1409             return
1410         video_title = mobj.group(1).decode('utf-8')
1411
1412         # video uploader is domain name
1413         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1414         if mobj is None:
1415             self._downloader.trouble(u'ERROR: unable to extract title')
1416             return
1417         video_uploader = mobj.group(1).decode('utf-8')
1418
1419         return [{
1420             'id':       video_id.decode('utf-8'),
1421             'url':      video_url.decode('utf-8'),
1422             'uploader': video_uploader,
1423             'upload_date':  None,
1424             'title':    video_title,
1425             'ext':      video_extension.decode('utf-8'),
1426         }]
1427
1428
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Queries have the form 'ytsearch:TERMS' (first result only),
    'ytsearchN:TERMS' (first N results) or 'ytsearchall:TERMS' (up to
    _max_youtube_results results).
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number.

        query is decoded with the preferred encoding before printing.
        """
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves
        # may legitimately contain colons.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return

        # Only the integer conversion is guarded, so that a ValueError
        # raised while downloading is not mistaken for a bad prefix.
        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        elif n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)
        return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query.

        Result pages of 50 entries are fetched from _API_URL until n ids
        were requested or the total reported by the API is exhausted;
        each collected id is then passed to the downloader as a watch URL.
        """

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            video_ids += [video['id'] for video in api_response['items']]

            # The API may know fewer matches than were asked for
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed the list past n entries
        for video_id in video_ids[:n]:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1503
1504
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Queries have the form 'gvsearch:TERMS' (first result only),
    'gvsearchN:TERMS' (first N results) or 'gvsearchall:TERMS' (up to
    _max_google_results results).
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number.

        query is decoded with the preferred encoding before printing.
        """
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves
        # may legitimately contain colons.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return

        # Only the integer conversion is guarded, so that a ValueError
        # raised while downloading is not mistaken for a bad prefix.
        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        elif n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)
        return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query.

        Result pages are fetched until n distinct video ids were found
        or a page no longer carries the "next page" marker; the ids
        collected so far are then passed to the downloader.
        """

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for found_id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % found_id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result page: download what was found
                for found_id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % found_id])
                return

            pagenum = pagenum + 1
1585
1586
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Queries have the form 'yvsearch:TERMS' (first result only),
    'yvsearchN:TERMS' (first N results) or 'yvsearchall:TERMS' (up to
    _max_yahoo_results results).
    """
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number.

        query is decoded with the preferred encoding before printing.
        """
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves
        # may legitimately contain colons.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return

        # Only the integer conversion is guarded, so that a ValueError
        # raised while downloading is not mistaken for a bad prefix.
        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        elif n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query.

        Result pages are fetched until n distinct video ids were found
        or a page no longer carries the "next page" marker; the ids
        collected so far are then passed to the downloader.
        """

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for found_id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % found_id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result page: download what was found
                for found_id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % found_id])
                return

            pagenum = pagenum + 1
1669
1670
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Announce that page number pagenum of the playlist is being fetched."""
        notice = u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)
        self._downloader.to_screen(notice)

    def _real_extract(self, url):
        """Collect the video ids of a playlist and download each of them.

        Playlist pages are fetched one after another until a page lacks
        the "next page" marker. The 'playliststart' and 'playlistend'
        downloader parameters select the slice of ids that is downloaded.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # The URL may point at one video inside the playlist
        single_video = match.group(3)
        if single_video is not None:
            self._downloader.download([single_video])
            return

        # 'p' is the default prefix; artist pages ('a') need a
        # different access path
        if match.group(1) == 'a':
            prefix, access = 'a', 'artist'
        else:
            prefix, access = 'p', 'view_play_list'
        playlist_id = match.group(2)
        indicator = self._VIDEO_INDICATOR_TEMPLATE % playlist_id

        collected = []
        page_number = 1
        while True:
            self.report_download_page(playlist_id, page_number)
            page_url = self._TEMPLATE_URL % (access, prefix, playlist_id, page_number)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather the ids of this page, keeping first occurrences only
            page_ids = []
            for hit in re.finditer(indicator, page):
                candidate = hit.group(1)
                if candidate in page_ids:
                    continue
                page_ids.append(candidate)
            collected += page_ids

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            page_number += 1

        first = self._downloader.params.get('playliststart', 1) - 1
        last = self._downloader.params.get('playlistend', -1)
        # -1 stands for "up to the end of the playlist"
        selected = collected[first:] if last == -1 else collected[first:last]

        for video_id in selected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1742
1743
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Announce that page number pagenum of the channel is being fetched."""
        notice = u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)
        self._downloader.to_screen(notice)

    def _real_extract(self, url):
        """Collect the video ids of a channel and download each of them.

        Channel pages are fetched one after another until a page lacks
        the "next page" marker.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        collected = []
        page_number = 1

        while True:
            self.report_download_page(channel_id, page_number)
            page_url = self._TEMPLATE_URL % (channel_id, page_number)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather the ids of this page, keeping first occurrences only
            page_ids = []
            for hit in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = hit.group(1)
                if candidate in page_ids:
                    continue
                page_ids.append(candidate)
            collected += page_ids

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            page_number += 1

        for video_id in collected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1792
1793
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Announce which range of video ids is being fetched for username."""
        end_index = start_index + self._GDATA_PAGE_SIZE
        notice = u'[youtube] user %s: Downloading video ids from %d to %d' % (
            username, start_index, end_index)
        self._downloader.to_screen(notice)

    def _real_extract(self, url):
        """Collect the ids of a user's uploads and download each of them.

        The ids come from the YouTube Data API, which is queried page by
        page (_GDATA_PAGE_SIZE ids per request). The 'playliststart' and
        'playlistend' downloader parameters select the slice of ids that
        is downloaded.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = match.group(1)
        page_size = self._GDATA_PAGE_SIZE

        collected = []
        page_index = 0
        while True:
            start_index = page_index * page_size + 1
            self.report_download_page(username, start_index)

            gdata_url = self._GDATA_URL % (username, page_size, start_index)
            request = compat_urllib_request.Request(gdata_url)

            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather the ids of this page, keeping first occurrences only
            page_ids = []
            for hit in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = hit.group(1)
                if candidate in page_ids:
                    continue
                page_ids.append(candidate)
            collected += page_ids

            # A page that is not full is taken to be the last one,
            # which saves one more query.
            if len(page_ids) < page_size:
                break

            page_index += 1

        total = len(collected)
        first = self._downloader.params.get('playliststart', 1) - 1
        last = self._downloader.params.get('playlistend', -1)
        # -1 stands for "up to the last id"
        selected = collected[first:] if last == -1 else collected[first:last]

        summary = u"[youtube] user %s: Collected %d video ids (downloading %d of them)" % (
            username, total, len(selected))
        self._downloader.to_screen(summary)

        for video_id in selected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1875
1876
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect the video ids of a blip.tv user and download each of them.

        The user page is fetched first to read the numeric users id from
        its 'data-users-id' attribute; the episode list is then queried
        page by page (_PAGE_SIZE entries per page). The 'playliststart'
        and 'playlistend' downloader parameters select the slice of ids
        that is downloaded. Problems are reported through the downloader
        and end the extraction.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Without the users id no episode list can be requested, so a
        # page lacking it is reported instead of crashing on None.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Compare in unescaped form, which is the form that is stored
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        # -1 stands for "up to the last id"
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1967
1968
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct download URL and title of a hosted file.

        The file page is requested in the English locale with the
        'Free download' form field set. Returns a one-element list with
        the information dictionary, or None after reporting the problem
        (including a restriction notice found on the page, if any)
        through the downloader.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse the whitespace of the notice into single blanks
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        # Extension is the URL suffix without its leading dot
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2031
2032
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Logs in during initialization when credentials are available (from the
    downloader parameters or from .netrc) and scrapes the video page for the
    metadata and the per-format video URLs.
    """

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Listed from best to worst quality; format selection relies on this order.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page.

        Returns a dictionary holding whichever of 'title', 'description',
        'owner' and 'thumbnail' could be found in the page, plus
        'video_urls', a dictionary mapping every format name found in the
        page to its URL (possibly empty).
        """
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece, pattern in data.items():
            mobj = re.search(pattern, video_webpage)
            if mobj is not None:
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook when credentials are available.

        Credentials come from the 'username'/'password' downloader
        parameters or, when 'usenetrc' is set, from the .netrc entry for
        _NETRC_MACHINE. Failures are reported as warnings only.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # Getting the login form back means the login was rejected
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the information dictionaries for the video at url.

        Returns a list with one dictionary per selected format, or None
        after reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except (ValueError, TypeError, OverflowError):
                    # Best effort: an unusable date leaves upload_date as None
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if not url_map:
            # No format URL was found in the page, so there is nothing to
            # download; report it instead of continuing without a URL list.
            self._downloader.trouble(u'ERROR: unable to extract video URLs')
            return

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        if format_limit is not None and format_limit in self._available_formats:
            format_list = self._available_formats[self._available_formats.index(format_limit):]
        else:
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if not existing_formats:
            self._downloader.trouble(u'ERROR: no known formats available for video')
            return
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        else:
            # Specific format
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id':       video_id.decode('utf-8'),
                'url':      video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   (u'NA' if format_param is None else format_param.decode('utf-8')),
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description.decode('utf-8'),
            })
        return results
2238
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report that information is being extracted for file_id."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)
        self._downloader.to_screen(message)

    def report_direct_download(self, title):
        """Report that the URL turned out to be a direct video download."""
        message = u'[%s] %s: Direct download detected' % (self.IE_NAME, title)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Return a one-element list with the information for url.

        The JSON description of the page is requested first; when the
        server answers with video content instead, the already opened
        handle is passed along as 'urlhandle'. Returns None after
        reporting an error through the downloader.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON parameters with the separator the URL calls for
        separator = '&' if '?' in url else '?'
        json_url = url + separator + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url.encode('utf-8'))
        self.report_extraction(url_match.group(1))

        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            content_type = urlh.headers.get('Content-Type', '')
            if content_type.startswith('video/'):
                # Direct download: derive title and extension from the URL
                file_name = url.rsplit('/', 1)[-1]
                stem, suffix = os.path.splitext(file_name)
                title = stem.decode('UTF-8')
                ext = suffix.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return

        if info is None:
            # Regular URL: the response body is the JSON description
            try:
                json_code = urlh.read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload may be wrapped in a 'Post' object
                data = json_data['Post'] if 'Post' in json_data else json_data

                stamp = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p')
                upload_date = stamp.strftime('%Y%m%d')
                video_url = data['media']['url']
                ext_match = re.match(self._URL_EXT, video_url)
                if ext_match is None:
                    raise ValueError('Can not determine filename extension')
                ext = ext_match.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError, KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # Overrides the shared User-Agent header for requests made after this
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2327
2328
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the information for url.

        The media URL is derived from the thumbnail link found in the watch
        page. Returns None after reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The captured prefix of the thumbnail URL is reused as the base of
        # the media URL below
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2386
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def report_extraction(self, episode_id):
        """Report information extraction."""
        message = u'[comedycentral] %s: Extracting information' % episode_id
        self._downloader.to_screen(message)

    def report_config_download(self, episode_id):
        """Report configuration download."""
        message = u'[comedycentral] %s: Downloading configuration' % episode_id
        self._downloader.to_screen(message)

    def report_index_download(self, episode_id):
        """Report show index download."""
        message = u'[comedycentral] %s: Downloading show index' % episode_id
        self._downloader.to_screen(message)

    def report_player_url(self, episode_id):
        """Report player URL resolution."""
        message = u'[comedycentral] %s: Determining player URL' % episode_id
        self._downloader.to_screen(message)


    def _print_formats(self, formats):
        """Print one line per format: bitrate, extension and dimensions."""
        print('Available formats:')
        for bitrate in formats:
            extension = self._video_extensions.get(bitrate, 'mp4')
            dimensions = self._video_dimensions.get(bitrate, '???')
            print('%s\t:\t%s\t[%s]' %(bitrate, extension, dimensions))


    def _real_extract(self, url):
        """Return one information dictionary per item of the episode.

        Accepts either a full-episodes URL or a ':shortname' alias, which
        is mapped to the show's full-episodes page. Returns None after
        reporting an error, or after listing formats when the 'listformats'
        parameter is set.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        shortname = url_match.group('shortname')
        if shortname:
            # Map the alias to the full-episodes page of the show
            if shortname in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            url_match = re.match(self._VALID_URL, url)
            assert url_match is not None

        # Without an episode part the URL is followed to wherever it redirects
        want_newest = not url_match.group('episode')
        episode_title = url_match.group('showname') if want_newest else url_match.group('episode')

        page_request = compat_urllib_request.Request(url)
        self.report_extraction(episode_title)
        try:
            page_handle = compat_urllib_request.urlopen(page_request)
            html = page_handle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        if want_newest:
            url = page_handle.geturl()
            url_match = re.match(self._VALID_URL, url)
            if url_match is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if url_match.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            episode_title = url_match.group('episode')

        movie_params = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if movie_params:
            player_url_raw, uri = movie_params[0]
        else:
            # The Colbert Report embeds the reference without a URL prefix,
            # so extract the alternate reference and add the prefix manually.
            mgids = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
            if not mgids:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            uri = mgids[0]
            player_url_raw = "http://media.mtvnservices.com/" + uri

        self.report_player_url(episode_title)
        try:
            player_handle = compat_urllib_request.urlopen(player_url_raw)
            # Resolved but currently not passed on (player_url is None below)
            player_url = player_handle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        index_query = compat_urllib_parse.urlencode({'uri': uri})
        index_url = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + index_query
        self.report_index_download(episode_title)
        try:
            index_xml = compat_urllib_request.urlopen(index_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        index_doc = xml.etree.ElementTree.fromstring(index_xml)
        for item in index_doc.findall('.//item'):
            media_id = item.findall('./guid')[0].text
            guid_parts = media_id.split(':')
            short_media_id = guid_parts[-1]
            show_id = guid_parts[-2].replace('.com', '')
            official_title = item.findall('./title')[0].text
            official_date = item.findall('./pubDate')[0].text

            config_query = compat_urllib_parse.urlencode({'uri': media_id})
            config_url = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                          config_query)
            config_request = compat_urllib_request.Request(config_url)
            self.report_config_download(episode_title)
            try:
                config_xml = compat_urllib_request.urlopen(config_request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            config_doc = xml.etree.ElementTree.fromstring(config_xml)
            # (bitrate, URL) pairs in document order
            renditions = [
                (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                for rendition in config_doc.findall('.//rendition')
            ]

            if not renditions:
                self._downloader.trouble(u'\nERROR: unable to download ' + media_id + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([bitrate for bitrate, _ in renditions])
                return

            # Default to the last rendition (taken to be the highest
            # bitrate), unless the requested format is among the renditions
            chosen_format, video_url = renditions[-1]
            requested_format = self._downloader.params.get('format', None)
            for candidate, candidate_url in renditions:
                if candidate == requested_format:
                    chosen_format, video_url = candidate, candidate_url
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            results.append({
                'id': short_media_id,
                'url': video_url,
                'uploader': show_id,
                'upload_date': official_date,
                'title': show_id + u'-' + episode_title,
                'ext': 'mp4',
                'format': chosen_format,
                'thumbnail': None,
                'description': official_title,
                'player_url': None #player_url
            })

        return results
2574
2575
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report configuration download."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _read_decoded(self, handle):
        """Read the response behind handle and return it decoded to text.

        The charset announced in a 'text/html; charset=...' Content-Type
        header is used when present, UTF-8 otherwise.
        """
        raw = handle.read()
        m = re.match(r'text/html; charset="?([^"]+)"?', handle.headers.get('Content-Type', ''))
        return raw.decode(m.group(1) if m else 'utf-8')

    def _real_extract(self, url):
        """Return a one-element list with the information for url.

        The video URL is taken from the player configuration referenced by
        the page's og:video meta tag. Description and thumbnail are optional
        and are None when the page does not provide them. Returns None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = self._read_decoded(compat_urllib_request.urlopen(url))
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Optional metadata: a missing tag leaves the field as None
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1)) if descMatch else None
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1)) if imgMatch else None

        # The player URL carries the configuration URL, so both are required
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            # Decoded to text so the quote replacement below works on it
            configJSON = self._read_decoded(compat_urllib_request.urlopen(configUrl))
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        try:
            videoUrl = config['playlist'][1]['url']
        except (KeyError, IndexError, TypeError):
            self._downloader.trouble(u'ERROR: unable to extract video URL from configuration')
            return

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2647
2648
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report XML manifest download."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list with the information for url.

        Metadata comes from the moogaloop XML document; the media URL is
        assembled from the f4m manifest that document points to. Returns
        None after reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            manifest_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # IndexError: element missing; KeyError: media has no url attribute
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # The media URL lives on the manifest's host: the manifest id minus
        # its last two characters, followed by the media node's url
        # attribute and a fixed 'Seg1-Frag1' suffix
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + manifest_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2718
2719
class XVideosIE(InfoExtractor):
    """Extractor for videos hosted on xvideos.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Tell the user that the video webpage is being downloaded."""
        message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def report_extraction(self, video_id):
        """Tell the user that information is being extracted."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Build the info dictionary for the video at url.

        Returns a single-element list holding the dictionary. When the URL
        does not match, the page cannot be fetched, or the media URL, the
        title or the thumbnail cannot be located in the page, the problem
        is reported through the downloader and None is returned.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = url_match.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # The page is always requested from the canonical address built
        # from the numeric id, whatever form the given URL had.
        page_request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            webpage = compat_urllib_request.urlopen(page_request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # Media URL: percent-encoded value of the flv_url parameter
        found = re.search(r'flv_url=(.+?)&', webpage)
        if not found:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        quoted_url = found.group(1).decode('utf-8')
        video_url = compat_urllib_parse.unquote(quoted_url)

        # Title: text of the <title> tag up to the site suffix
        found = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if not found:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = found.group(1).decode('utf-8')

        # Thumbnail: the whole matched URL is used, not just the file name
        found = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if not found:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = found.group(0).decode('utf-8')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2788
2789
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

    To access the media, the uid of the song and a stream token are
    extracted from the page source; the media URL is then composed of
    the uid and the stream token.

    NOTE(review): a request to media.soundcloud.com/crossdomain.xml is
    said to be required for cookies, but this extractor never sends one.
    Confirm whether it is really needed.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the track information for a soundcloud.com URL.

        Returns a single-element list with the info dictionary, or None
        (after reporting the error through the downloader) when the URL is
        invalid, the page cannot be downloaded, or the uid and stream
        token cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + u'-' + slug_title
        track_name = '%s/%s' % (uploader, slug_title)

        self.report_webpage(track_name)

        request = compat_urllib_request.Request('http://soundcloud.com/%s' % track_name)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(track_name)

        # extract uid and stream token that soundcloud hands out for access;
        # without them no media URL can be built, so bail out explicitly
        # instead of failing later on unbound names
        mobj = re.search(r'"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uid and stream token')
            return
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title, falling back to the one from the url
        mobj = re.search(r'"title":"(.*?)",', webpage)
        if mobj:
            title = mobj.group(1).decode('utf-8')
        else:
            title = simple_title

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s" % (video_id, stream_token)

        # description (decoded here, once, rather than again at return time)
        description = u'No description available'
        mobj = re.search(r'track-description-value"><p>(.*?)</p>', webpage)
        if mobj:
            description = mobj.group(1).decode('utf-8')

        # upload date; left as None when absent or unparseable
        upload_date = None
        mobj = re.search(r"pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        if mobj:
            try:
                upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
            except ValueError as err:
                self._downloader.to_stderr(compat_str(err))

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      mediaURL,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    title,
            'ext':      u'mp3',
            'description': description,
        }]
2880
2881
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the video information for an infoq.com URL.

        Returns a single-element list with the info dictionary, or None
        (after reporting the error through the downloader) when the URL is
        invalid, the page cannot be downloaded, or the video URL, title or
        file extension cannot be determined.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL: the jsclassref attribute holds the stream path,
        # base64-encoded and percent-quoted
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # Split the file name on its LAST dot only, so that names containing
        # several dots do not break the id/extension unpacking
        video_filename = video_url.split('/')[-1]
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            self._downloader.trouble(u'ERROR: unable to extract video extension')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2950
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    @staticmethod
    def _bitrate_sort_key(bitrate):
        """Sort key ranking numeric bitrates by value, above non-numeric ones.

        Keys of a decoded JSON object are strings, so a plain max() would
        compare them lexicographically ('64' > '128').
        """
        try:
            return (1, int(bitrate))
        except (TypeError, ValueError):
            return (0, bitrate)

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        jsonData[fmt] is either a mapping from bitrate to a list of urls or,
        when there is no bitrate info, the list of urls itself. For a
        mapping, the urls of the requested bitrate are returned; the highest
        bitrate is used when bitrate is None, 'best' or not available.
        """
        entries = jsonData[fmt]
        if not isinstance(entries, dict):
            return entries # we have no bitrate info.
        if not entries:
            return []
        if bitrate is None or bitrate == 'best' or bitrate not in entries:
            bitrate = max(entries, key=self._bitrate_sort_key) # select highest
        return entries[bitrate]

    def check_urls(self, url_list):
        """Returns 1st active url from list (None if there is none)"""
        for url in url_list or []:
            try:
                # only reachability matters: close the handle right away
                compat_urllib_request.urlopen(url).close()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
            return url

        return None

    def _print_formats(self, formats):
        """Print the available formats, one bitrate per line when known."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract the cloudcast information for a mixcloud.com URL.

        Returns a single-element list with the info dictionary. Returns
        None after listing the formats when the 'listformats' parameter is
        set, or after reporting an error through the downloader when the
        URL is invalid, the API data cannot be retrieved or parsed, the
        requested format is unavailable or no media url is reachable.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request from the matched path components, so that it
        # does not depend on whether the url ends with a slash
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/%s/%s.json' % (mobj.group(1), mobj.group(2))
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        self.report_download_json(api_url)
        try:
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        try:
            json_data = json.loads(jsonData)
            player_url = json_data['player_swf_url']
            formats = dict(json_data['audio_formats'])
            title = json_data['name']
        except (ValueError, KeyError, TypeError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            # take the first format that has a reachable url
            for format_param in formats.keys():
                file_url = self.check_urls(self.get_urls(formats, format_param))
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            file_url = self.check_urls(self.get_urls(formats, req_format))
            format_param = req_format

        if file_url is None:
            self._downloader.trouble(u'ERROR: unable to find a working media url')
            return

        return [{
            'id': file_id,
            'url': file_url.decode('utf-8'),
            'uploader': uploader,
            'upload_date': None,
            'title': title,
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (u'NA' if format_param is None else format_param.decode('utf-8')),
            'thumbnail': json_data.get('thumbnail_url'),
            'description': json_data.get('description'),
            'player_url': player_url.decode('utf-8'),
        }]
3063
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _build_references(self, page, link_re):
        """Return one 'reference' entry per distinct link in page matching link_re."""
        return [
            {
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(link),
            }
            for link in orderedSet(re.findall(link_re, page))]

    def _extract_references(self, entries):
        """Run the extractor on every referenced url and concatenate the results."""
        results = []
        for entry in entries:
            results += self.extract(entry['url'])
        return results

    def _real_extract(self, url):
        """Extract a single video, all videos of a course, or all courses.

        Which one depends on the 'course' and 'video' parameters found in
        url. Returns a list of info dictionaries, or None after reporting
        an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            # NOTE: this playlist description is assembled but only its
            # 'list' entries are used; the extracted videos are returned.
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search(r'<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search(r'<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            info['list'] = self._build_references(coursepage, r'<a href="(VideoPage.php\?[^"]+)">')
            return self._extract_references(info['list'])

        else: # Root page
            # NOTE: as for course pages, only the 'list' entries are used.
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            info['list'] = self._build_references(rootpage, r'<a href="(CoursePage.php\?[^"]+)">')
            return self._extract_references(info['list'])
3180
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the video information for an mtv.com video URL.

        Returns a single-element list with the info dictionary, or None
        (after reporting the error through the downloader) when the URL is
        invalid, a download fails, a required page field is missing or the
        metadata holds no usable rendition.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            self._downloader.trouble(u'ERROR: no renditions found in video metadata')
            return

        # For now, always pick the highest quality.
        # NOTE(review): assumes the last rendition listed is the best one.
        rendition = renditions[-1]

        # find() returns None when the element is missing
        src_node = rendition.find('./src')
        if src_node is None:
            self._downloader.trouble('Invalid rendition field.')
            return
        video_url = src_node.text

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
3270
3271
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com"""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Return a session id: the time in milliseconds plus two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the source alphabet as a list, shuffled deterministically by seed."""
        # The backslash is part of the alphabet; it is written as an explicit
        # escape instead of relying on '\:' being an unrecognized one.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        mixed = []
        seed = float(seed)
        for _ in range(len(source)):
            seed = (seed * 211 + 30031) % 65536
            # seed / 65536 lies in [0, 1), so the index is always valid
            index = int(seed / 65536 * len(source))
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode fileId, a '*'-separated list of indexes into the mixed alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Extract the information of every segment of a Youku video.

        Returns a list with one info dictionary per segment, or None (after
        reporting the error through the downloader) when the URL is invalid,
        the playlist cannot be downloaded or its content cannot be used.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        self.report_download_webpage(video_id)
        try:
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            video_data = config['data'][0]

            video_title = video_data['title']
            seed = video_data['seed']

            req_format = self._downloader.params.get('format', None)
            supported_format = video_data['streamfileids'].keys()

            if req_format is None or req_format == 'best':
                if 'hd2' in supported_format:
                    stream_format = 'hd2'
                else:
                    stream_format = 'flv'
                ext = u'flv'
            elif req_format == 'worst':
                stream_format = 'mp4'
                ext = u'mp4'
            else:
                stream_format = 'flv'
                ext = u'flv'

            # one key per segment
            keys = [seg['k'] for seg in video_data['segs'][stream_format]]
            fileid = self._get_file_id(video_data['streamfileids'][stream_format], seed)

            #TODO check error
            #youku only could be viewed from mainland china
        except (ValueError, KeyError, IndexError, TypeError, AttributeError):
            # Only data-shape problems are reported here; interrupts and
            # programming errors are allowed to propagate.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()

        # fileid[8:10] holds the segment number as two hex digits and is
        # replaced with the index of each segment
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3392
3393
class XNXXIE(InfoExtractor):
    """Extractor for videos hosted on video.xnxx.com."""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Tell the user that the video webpage is being downloaded."""
        message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def report_extraction(self, video_id):
        """Tell the user that information is being extracted."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Build the info dictionary for the video at url.

        Returns a single-element list holding the dictionary. When the URL
        does not match, the page cannot be fetched, or the media URL, the
        title or the thumbnail cannot be located in the page, the problem
        is reported through the downloader and None is returned.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = url_match.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Fetch the page exactly as addressed by the caller
        try:
            webpage = compat_urllib_request.urlopen(url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Media URL: stored percent-encoded in the page
        found = re.search(self.VIDEO_URL_RE, webpage)
        if not found:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        quoted_url = found.group(1).decode('utf-8')
        video_url = compat_urllib_parse.unquote(quoted_url)

        # Title
        found = re.search(self.VIDEO_TITLE_RE, webpage)
        if not found:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = found.group(1).decode('utf-8')

        # Thumbnail
        found = re.search(self.VIDEO_THUMB_RE, webpage)
        if not found:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = found.group(1).decode('utf-8')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3455
3456
3457 class GooglePlusIE(InfoExtractor):
3458     """Information extractor for plus.google.com."""
3459
3460     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3461     IE_NAME = u'plus.google'
3462
    def __init__(self, downloader=None):
        """Initialize the extractor by delegating to InfoExtractor.__init__."""
        InfoExtractor.__init__(self, downloader)
3465
3466     def report_extract_entry(self, url):
3467         """Report downloading extry"""
3468         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3469
3470     def report_date(self, upload_date):
3471         """Report downloading extry"""
3472         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3473
3474     def report_uploader(self, uploader):
3475         """Report downloading extry"""
3476         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3477
3478     def report_title(self, video_title):
3479         """Report downloading extry"""
3480         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3481
3482     def report_extract_vid_page(self, video_page):
3483         """Report information extraction."""
3484         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3485
3486     def _real_extract(self, url):
3487         # Extract id from URL
3488         mobj = re.match(self._VALID_URL, url)
3489         if mobj is None:
3490             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3491             return
3492
3493         post_url = mobj.group(0)
3494         video_id = mobj.group(2)
3495
3496         video_extension = 'flv'
3497
3498         # Step 1, Retrieve post webpage to extract further information
3499         self.report_extract_entry(post_url)
3500         request = compat_urllib_request.Request(post_url)
3501         try:
3502             webpage = compat_urllib_request.urlopen(request).read()
3503         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3504             self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3505             return
3506
3507         # Extract update date
3508         upload_date = None
3509         pattern = 'title="Timestamp">(.*?)</a>'
3510         mobj = re.search(pattern, webpage)
3511         if mobj:
3512             upload_date = mobj.group(1)
3513             # Convert timestring to a format suitable for filename
3514             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3515             upload_date = upload_date.strftime('%Y%m%d')
3516         self.report_date(upload_date)
3517
3518         # Extract uploader
3519         uploader = None
3520         pattern = r'rel\="author".*?>(.*?)</a>'
3521         mobj = re.search(pattern, webpage)
3522         if mobj:
3523             uploader = mobj.group(1)
3524         self.report_uploader(uploader)
3525
3526         # Extract title
3527         # Get the first line for title
3528         video_title = u'NA'
3529         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3530         mobj = re.search(pattern, webpage)
3531         if mobj:
3532             video_title = mobj.group(1)
3533         self.report_title(video_title)
3534
3535         # Step 2, Stimulate clicking the image box to launch video
3536         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3537         mobj = re.search(pattern, webpage)
3538         if mobj is None:
3539             self._downloader.trouble(u'ERROR: unable to extract video page URL')
3540
3541         video_page = mobj.group(1)
3542         request = compat_urllib_request.Request(video_page)
3543         try:
3544             webpage = compat_urllib_request.urlopen(request).read()
3545         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3546             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3547             return
3548         self.report_extract_vid_page(video_page)
3549
3550
3551         # Extract video links on video page
3552         """Extract video links of all sizes"""
3553         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3554         mobj = re.findall(pattern, webpage)
3555         if len(mobj) == 0:
3556             self._downloader.trouble(u'ERROR: unable to extract video links')
3557
3558         # Sort in resolution
3559         links = sorted(mobj)
3560
3561         # Choose the lowest of the sort, i.e. highest resolution
3562         video_url = links[-1]
3563         # Only get the url. The resolution part in the tuple has no use anymore
3564         video_url = video_url[-1]
3565         # Treat escaped \u0026 style hex
3566         video_url = unicode(video_url, "unicode_escape")
3567
3568
3569         return [{
3570             'id':       video_id.decode('utf-8'),
3571             'url':      video_url,
3572             'uploader': uploader.decode('utf-8'),
3573             'upload_date':  upload_date.decode('utf-8'),
3574             'title':    video_title.decode('utf-8'),
3575             'ext':      video_extension.decode('utf-8'),
3576         }]