2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
13 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title: Video title, unescaped.
    ext: Video filename extension.
    uploader: Full name of the video uploader.
    upload_date: Video upload date (YYYYMMDD).

    The following fields are optional:

    format: The video format, defaults to ext (used for --get-format)
    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.
    uploader_id: Nickname or id of the video uploader.
    player_url: SWF Player URL (used for rtmpdump).
    subtitles: The .srt file contents.
    urlhandle: [internal] The urlHandle to be used to download the file,
        like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): this excerpt is line-sampled -- several structural lines
    # (method headers, guards, returns) are missing from view.  Comments below
    # annotate only what is visible; confirm against the full file.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # A non-None match against the subclass-provided _VALID_URL pattern
        # marks the URL as handled by this extractor.
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the enclosing "def" line for the following docstring is
    # missing from this excerpt (presumably the _WORKING accessor).
    """Getter method for _WORKING."""

    # NOTE(review): the enclosing "def" line for the following two lines is
    # missing from this excerpt (presumably the public initialize() wrapper).
    """Initializes an instance (authentication, etc)."""
    self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Delegates to the subclass hook; see class docstring for the
        # required dictionary fields.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): the enclosing "def" line for the following return is
    # missing from this excerpt (presumably an IE_NAME-style property that
    # strips the trailing "IE" from the class name).
    return type(self).__name__[:-2]
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this excerpt is line-sampled -- many structural lines
    # (try:, else:, if-guards, returns, the _VALID_URL assignment opener and
    # the bodies/closers of the dict literals below) are missing from view.
    # Comments annotate only what is visible; confirm against the full file.

    # NOTE(review): the "_VALID_URL = r'''..." opening line for this verbose
    # regex is missing from this excerpt.
        (?:https?://)? # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)? # handle anchor (#/) redirect urls
        (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
        (?: # the various things that can precede the ID:
            (?:(?:v|embed|e)/) # v/ or embed/ or e/
            |(?: # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?) # the params delimiter ? or # or #!
                (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
        )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow

    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension map; most entries are elided in this excerpt.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WIDTHxHEIGHT" display map; entries are elided in this excerpt.
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base implementation to pass re.VERBOSE, since
        # _VALID_URL above is written as a commented, verbose regex.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into .srt subtitle text."""
        # NOTE(review): the accumulator initialization ("srt = ''"), the
        # numeric conversion of `start`, and the final "return srt" are
        # missing from this excerpt.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # captions without a duration get 4 seconds
            end = start + float(dur)
            # Render HH:MM:SS,mmm timestamps as required by the .srt format.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _extract_subtitles(self, video_id):
        """Fetch the subtitle track for *video_id*.

        Returns a (warning_message, srt_contents) pair; exactly one of the
        two is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): the "try:" opener for the following download is
        # missing from this excerpt.
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name from the listing XML.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language choice: user-requested, then English, then whatever exists.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            # NOTE(review): the "srt_lang = 'en'" assignment and the
            # trailing "else:" line are missing from this excerpt.
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        # NOTE(review): another "try:" opener is missing here.
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
            # NOTE(review): the "if not srt_xml:" guard for this return is
            # missing from this excerpt.
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available itag formats with extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the "for x in formats:" loop header is missing from
        # this excerpt.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, log in (params or .netrc) and confirm age."""
        if self._downloader is None:
            # NOTE(review): the early "return" for this guard is missing
            # from this excerpt.

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the "try:" opener and the success branch reading
            # username/password from the netrc info are missing here.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Force an English-language cookie so later scraping sees known text.
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): "try:" opener missing here.
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # NOTE(review): the "if username is None: return" guard and the
        # "login_form = {" opener are missing from this excerpt.
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # NOTE(review): "try:" opener missing here.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # A login form in the response means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # NOTE(review): the "age_form = {" opener is missing here.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # NOTE(review): "try:" opener missing here.
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the YouTube video id embedded in *url*."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            # NOTE(review): the "if mobj is None:" guard and its "return"
            # are missing from this excerpt.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Group 2 is the ([0-9A-Za-z_-]+) id capture in _VALID_URL.
        video_id = mobj.group(2)
        # NOTE(review): the "return video_id" line is missing here.

    def _real_extract(self, url):
        """Download the watch page + get_video_info and build info dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            # NOTE(review): the "if mobj is not None:" guard is missing here.
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        # NOTE(review): "try:" opener missing here.
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # NOTE(review): the "if mobj is not None:" guard and the "else:
            # player_url = None" branch are missing from this excerpt.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several &el= variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            # NOTE(review): "try:" opener and the loop "break" after a token
            # is found are missing from this excerpt.
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                # NOTE(review): the "else:" before this branch is missing.
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
            # NOTE(review): the "if mobj is not None: ... else:" guard pair
            # around the next two lines is missing from this excerpt.
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scrape the watch page and normalize to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            # NOTE(review): the "upload_date = None" default and the
            # "if mobj is not None:" guard are missing from this excerpt.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                # NOTE(review): the "try:/except ValueError: pass" wrapper
                # around the parse attempt is missing here.
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            # NOTE(review): the "else:" before the fallback is missing.
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
                # NOTE(review): the "if srt_error:" guard is missing here.
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            # NOTE(review): the "video_duration = ''" fallback and "else:"
            # are missing from this excerpt.
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                # NOTE(review): the "else:" before the fallback is missing.
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # NOTE(review): the "else:" opening this branch is missing.
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # NOTE(review): the "if rf in url_map:" guard and the
                    # loop "break" are missing from this excerpt.
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            # NOTE(review): the "else:" opening this branch is missing.
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the "results.append({" opener (and the final
            # "return results") are missing from this excerpt.
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this excerpt is line-sampled -- "try:" openers, "if mobj
    # is None:" guards, "return" statements and the result-dict opener are
    # missing from view.  Comments annotate only what is visible.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter opt-out."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): "try:" opener missing here.
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        # NOTE(review): the "disclaimer_form = {" opener is missing here.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): "try:" opener missing here.
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the media URL, uploader and title for a metacafe video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            # NOTE(review): the "if mobj is None:" guard and its "return"
            # are missing from this excerpt.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor via the
            # downloader; the "return" after this call is missing here.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # NOTE(review): "try:" opener missing here.
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            # NOTE(review): the "if mobj is not None:" guard is missing here;
            # the flashvars path below is presumably the else-branch.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: pull the media URL out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        # .decode('utf-8') on a match group is Python 2 bytes handling.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the "return [{" opener for the result list/dict is
        # missing from this excerpt.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this excerpt is line-sampled -- "try:" openers, "if mobj
    # is None:" guards, "return" statements and the result-dict opener are
    # missing from view.  Comments annotate only what is visible.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video URL, uploader, title and date from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            # NOTE(review): the "if mobj is None:" guard and its "return"
            # are missing from this excerpt.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip title suffix and query noise from the captured id segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so restricted videos are still served.
        request.add_header('Cookie', 'family_filter=off')
        # NOTE(review): "try:" opener missing here.
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality key present in flashvars (ordered best-first).
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # NOTE(review): the "if key in flashvars:" guard, the
            # "max_quality = key" assignment, the "break", and the trailing
            # "else:" are missing from this excerpt.
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # NOTE(review): the "if mobj is None:" guard is missing; the
            # block below is presumably its body, with the final assignment
            # being the else-branch.
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reorder to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the "return [{" opener (and the id/url/ext entries
        # before these) are missing from this excerpt.
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this excerpt is line-sampled -- "try:" openers, "if mobj
    # is None:" guards, "return" statements and the result-dict opener are
    # missing from view.  Comments annotate only what is visible.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL, uploader and title from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            # NOTE(review): the "if mobj is None:" guard and its "return"
            # are missing from this excerpt.
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): "try:" opener missing here.
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # NOTE(review): the "video_url = mediaURL" assignment is missing
        # from this excerpt.

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        # .decode('utf-8') on a match group is Python 2 bytes handling.
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the "return [{" opener for the result list/dict is
        # missing from this excerpt.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): this excerpt is line-sampled -- "try:" openers, "if mobj
    # is None:" guards, "return" statements and the result-dict opener are
    # missing from view.  Comments annotate only what is visible.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract Yahoo! Video info; may recurse once after rewriting the
        URL into the canonical /watch/ form (new_video=False on recursion)."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            # NOTE(review): the "if mobj is None:" guard and its "return"
            # are missing from this excerpt.
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            # NOTE(review): "try:" opener missing here.
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse exactly once with the canonical watch URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): "try:" opener missing here.
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        # .decode('utf-8') on a match group is Python 2 bytes handling.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        # NOTE(review): "try:" opener missing here.
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the "return [{" opener for the result list/dict is
        # missing from this excerpt.
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
960 class VimeoIE(InfoExtractor):
961 """Information extractor for vimeo.com."""
963 # _VALID_URL matches Vimeo URLs
964 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
def __init__(self, downloader=None):
    """Create the Vimeo extractor, optionally attaching a downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce on screen that the vimeo page for *video_id* is being fetched."""
    message = u'[vimeo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that information extraction for *video_id* has started."""
    message = u'[vimeo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
978 def _real_extract(self, url, new_video=True):
979 # Extract ID from URL
980 mobj = re.match(self._VALID_URL, url)
982 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
985 video_id = mobj.group(1)
987 # Retrieve video webpage to extract further information
988 request = compat_urllib_request.Request(url, None, std_headers)
990 self.report_download_webpage(video_id)
991 webpage_bytes = compat_urllib_request.urlopen(request).read()
992 webpage = webpage_bytes.decode('utf-8')
993 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
994 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
997 # Now we begin extracting as much information as we can from what we
998 # retrieved. First we extract the information common to all extractors,
999 # and latter we extract those that are Vimeo specific.
1000 self.report_extraction(video_id)
1002 # Extract the config JSON
1004 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1005 config = json.loads(config)
1007 self._downloader.trouble(u'ERROR: unable to extract info section')
1011 video_title = config["video"]["title"]
1013 # Extract uploader and uploader_id
1014 video_uploader = config["video"]["owner"]["name"]
1015 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1017 # Extract video thumbnail
1018 video_thumbnail = config["video"]["thumbnail"]
1020 # Extract video description
1021 video_description = get_element_by_attribute("itemprop", "description", webpage)
1022 if video_description: video_description = clean_html(video_description)
1023 else: video_description = ''
1025 # Extract upload date
1026 video_upload_date = None
1027 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1028 if mobj is not None:
1029 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1031 # Vimeo specific: extract request signature and timestamp
1032 sig = config['request']['signature']
1033 timestamp = config['request']['timestamp']
1035 # Vimeo specific: extract video codec and quality information
1036 # First consider quality, then codecs, then take everything
1037 # TODO bind to format param
1038 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1039 files = { 'hd': [], 'sd': [], 'other': []}
1040 for codec_name, codec_extension in codecs:
1041 if codec_name in config["video"]["files"]:
1042 if 'hd' in config["video"]["files"][codec_name]:
1043 files['hd'].append((codec_name, codec_extension, 'hd'))
1044 elif 'sd' in config["video"]["files"][codec_name]:
1045 files['sd'].append((codec_name, codec_extension, 'sd'))
1047 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1049 for quality in ('hd', 'sd', 'other'):
1050 if len(files[quality]) > 0:
1051 video_quality = files[quality][0][2]
1052 video_codec = files[quality][0][0]
1053 video_extension = files[quality][0][1]
1054 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1057 self._downloader.trouble(u'ERROR: no known codec found')
1060 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1061 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1066 'uploader': video_uploader,
1067 'uploader_id': video_uploader_id,
1068 'upload_date': video_upload_date,
1069 'title': video_title,
1070 'ext': video_extension,
1071 'thumbnail': video_thumbnail,
1072 'description': video_description,
1076 class ArteTvIE(InfoExtractor):
1077 """arte.tv information extractor."""
1079 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1080 _LIVE_URL = r'index-[0-9]+\.html$'
1082 IE_NAME = u'arte.tv'
def __init__(self, downloader=None):
    """Create the arte.tv extractor, optionally attaching a downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce that the arte.tv page for *video_id* is being fetched."""
    message = u'[arte.tv] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that information extraction for *video_id* has started."""
    message = u'[arte.tv] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1095 def fetch_webpage(self, url):
1096 request = compat_urllib_request.Request(url)
1098 self.report_download_webpage(url)
1099 webpage = compat_urllib_request.urlopen(request).read()
1100 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1101 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1103 except ValueError as err:
1104 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1108 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1109 page = self.fetch_webpage(url)
1110 mobj = re.search(regex, page, regexFlags)
1114 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1117 for (i, key, err) in matchTuples:
1118 if mobj.group(i) is None:
1119 self._downloader.trouble(err)
1122 info[key] = mobj.group(i)
1126 def extractLiveStream(self, url):
1127 video_lang = url.split('/')[-4]
1128 info = self.grep_webpage(
1130 r'src="(.*?/videothek_js.*?\.js)',
1133 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1136 http_host = url.split('/')[2]
1137 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1138 info = self.grep_webpage(
1140 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1141 '(http://.*?\.swf).*?' +
1145 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1146 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1147 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1150 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1152 def extractPlus7Stream(self, url):
1153 video_lang = url.split('/')[-3]
1154 info = self.grep_webpage(
1156 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1159 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1162 next_url = compat_urllib_parse.unquote(info.get('url'))
1163 info = self.grep_webpage(
1165 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1168 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1171 next_url = compat_urllib_parse.unquote(info.get('url'))
1173 info = self.grep_webpage(
1175 r'<video id="(.*?)".*?>.*?' +
1176 '<name>(.*?)</name>.*?' +
1177 '<dateVideo>(.*?)</dateVideo>.*?' +
1178 '<url quality="hd">(.*?)</url>',
1181 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1182 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1183 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1184 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1189 'id': info.get('id'),
1190 'url': compat_urllib_parse.unquote(info.get('url')),
1191 'uploader': u'arte.tv',
1192 'upload_date': info.get('date'),
1193 'title': info.get('title').decode('utf-8'),
1199 def _real_extract(self, url):
1200 video_id = url.split('/')[-1]
1201 self.report_extraction(video_id)
1203 if re.search(self._LIVE_URL, video_id) is not None:
1204 self.extractLiveStream(url)
1207 info = self.extractPlus7Stream(url)
1212 class GenericIE(InfoExtractor):
1213 """Generic last-resort information extractor."""
1216 IE_NAME = u'generic'
def __init__(self, downloader=None):
    """Create the generic fallback extractor, optionally attaching a downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Warn that the generic extractor is in use, then announce the page download."""
    for msg in (u'WARNING: Falling back on generic information extractor.',
                u'[generic] %s: Downloading webpage' % video_id):
        self._downloader.to_screen(msg)
def report_extraction(self, video_id):
    """Announce that information extraction for *video_id* has started."""
    message = u'[generic] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
def report_following_redirect(self, new_url):
    """Announce that a redirect (e.g. a URL shortener) is being followed."""
    message = u'[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(message)
1234 def _test_redirect(self, url):
1235 """Check if it is a redirect, like url shorteners, in case restart chain."""
1236 class HeadRequest(compat_urllib_request.Request):
1237 def get_method(self):
1240 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1242 Subclass the HTTPRedirectHandler to make it use our
1243 HeadRequest also on the redirected URL
1245 def redirect_request(self, req, fp, code, msg, headers, newurl):
1246 if code in (301, 302, 303, 307):
1247 newurl = newurl.replace(' ', '%20')
1248 newheaders = dict((k,v) for k,v in req.headers.items()
1249 if k.lower() not in ("content-length", "content-type"))
1250 return HeadRequest(newurl,
1252 origin_req_host=req.get_origin_req_host(),
1255 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1257 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1259 Fallback to GET if HEAD is not allowed (405 HTTP error)
1261 def http_error_405(self, req, fp, code, msg, headers):
1265 newheaders = dict((k,v) for k,v in req.headers.items()
1266 if k.lower() not in ("content-length", "content-type"))
1267 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1269 origin_req_host=req.get_origin_req_host(),
1273 opener = compat_urllib_request.OpenerDirector()
1274 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1275 HTTPMethodFallback, HEADRedirectHandler,
1276 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1277 opener.add_handler(handler())
1279 response = opener.open(HeadRequest(url))
1280 new_url = response.geturl()
1285 self.report_following_redirect(new_url)
1286 self._downloader.download([new_url])
1289 def _real_extract(self, url):
1290 if self._test_redirect(url): return
1292 video_id = url.split('/')[-1]
1293 request = compat_urllib_request.Request(url)
1295 self.report_download_webpage(video_id)
1296 webpage = compat_urllib_request.urlopen(request).read()
1297 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1298 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1300 except ValueError as err:
1301 # since this is the last-resort InfoExtractor, if
1302 # this error is thrown, it'll be thrown here
1303 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1306 self.report_extraction(video_id)
1307 # Start with something easy: JW Player in SWFObject
1308 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1310 # Broaden the search a little bit
1311 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1313 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1316 # It's possible that one of the regexes
1317 # matched, but returned an empty group:
1318 if mobj.group(1) is None:
1319 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1322 video_url = compat_urllib_parse.unquote(mobj.group(1))
1323 video_id = os.path.basename(video_url)
1325 # here's a fun little line of code for you:
1326 video_extension = os.path.splitext(video_id)[1][1:]
1327 video_id = os.path.splitext(video_id)[0]
1329 # it's tempting to parse this further, but you would
1330 # have to take into account all the variations like
1331 # Video Title - Site Name
1332 # Site Name | Video Title
1333 # Video Title - Tagline | Site Name
1334 # and so on and so forth; it's just not practical
1335 mobj = re.search(r'<title>(.*)</title>', webpage)
1337 self._downloader.trouble(u'ERROR: unable to extract title')
1339 video_title = mobj.group(1)
1341 # video uploader is domain name
1342 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1344 self._downloader.trouble(u'ERROR: unable to extract title')
1346 video_uploader = mobj.group(1)
1351 'uploader': video_uploader,
1352 'upload_date': None,
1353 'title': video_title,
1354 'ext': video_extension,
1358 class YoutubeSearchIE(InfoExtractor):
1359 """Information Extractor for YouTube search queries."""
1360 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1361 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1362 _max_youtube_results = 1000
1363 IE_NAME = u'youtube:search'
def __init__(self, downloader=None):
    """Create the YouTube search extractor, optionally attaching a downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_page(self, query, pagenum):
    """Report attempt to download search page with given number.

    *query* may arrive as bytes (callers encode it before searching);
    decode only in that case. Unconditionally calling ``query.decode``
    crashes on Python 3, where ``str`` has no ``decode`` method.
    """
    if isinstance(query, bytes):
        query = query.decode(preferredencoding())
    self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1373 def _real_extract(self, query):
1374 mobj = re.match(self._VALID_URL, query)
1376 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1379 prefix, query = query.split(':')
1381 query = query.encode('utf-8')
1383 self._download_n_results(query, 1)
1385 elif prefix == 'all':
1386 self._download_n_results(query, self._max_youtube_results)
1392 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1394 elif n > self._max_youtube_results:
1395 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1396 n = self._max_youtube_results
1397 self._download_n_results(query, n)
1399 except ValueError: # parsing prefix as integer fails
1400 self._download_n_results(query, 1)
1403 def _download_n_results(self, query, n):
1404 """Downloads a specified number of results for a query"""
1410 while (50 * pagenum) < limit:
1411 self.report_download_page(query, pagenum+1)
1412 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1413 request = compat_urllib_request.Request(result_url)
1415 data = compat_urllib_request.urlopen(request).read()
1416 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1417 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1419 api_response = json.loads(data)['data']
1421 new_ids = list(video['id'] for video in api_response['items'])
1422 video_ids += new_ids
1424 limit = min(n, api_response['totalItems'])
1427 if len(video_ids) > n:
1428 video_ids = video_ids[:n]
1429 for id in video_ids:
1430 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1434 class GoogleSearchIE(InfoExtractor):
1435 """Information Extractor for Google Video search queries."""
1436 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1437 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1438 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1439 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1440 _max_google_results = 1000
1441 IE_NAME = u'video.google:search'
def __init__(self, downloader=None):
    """Create the Google Video search extractor, optionally attaching a downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_page(self, query, pagenum):
    """Report attempt to download playlist page with given number.

    *query* may arrive as bytes (callers encode it before searching);
    decode only in that case. Unconditionally calling ``query.decode``
    crashes on Python 3, where ``str`` has no ``decode`` method.
    """
    if isinstance(query, bytes):
        query = query.decode(preferredencoding())
    self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1451 def _real_extract(self, query):
1452 mobj = re.match(self._VALID_URL, query)
1454 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1457 prefix, query = query.split(':')
1459 query = query.encode('utf-8')
1461 self._download_n_results(query, 1)
1463 elif prefix == 'all':
1464 self._download_n_results(query, self._max_google_results)
1470 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1472 elif n > self._max_google_results:
1473 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1474 n = self._max_google_results
1475 self._download_n_results(query, n)
1477 except ValueError: # parsing prefix as integer fails
1478 self._download_n_results(query, 1)
1481 def _download_n_results(self, query, n):
1482 """Downloads a specified number of results for a query"""
1488 self.report_download_page(query, pagenum)
1489 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1490 request = compat_urllib_request.Request(result_url)
1492 page = compat_urllib_request.urlopen(request).read()
1493 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1494 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1497 # Extract video identifiers
1498 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1499 video_id = mobj.group(1)
1500 if video_id not in video_ids:
1501 video_ids.append(video_id)
1502 if len(video_ids) == n:
1503 # Specified n videos reached
1504 for id in video_ids:
1505 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1508 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1509 for id in video_ids:
1510 self._downloader#!/usr/bin/env python
1511 # -*- coding: utf-8 -*-
1513 from __future__ import absolute_import
1522 import xml.etree.ElementTree
1526 from .utils import *
1529 class InfoExtractor(object):
1530 """Information Extractor class.
1532 Information extractors are the classes that, given a URL, extract
1533 information about the video (or videos) the URL refers to. This
1534 information includes the real video URL, the video title, author and
1535 others. The information is stored in a dictionary which is then
1536 passed to the FileDownloader. The FileDownloader processes this
1537 information possibly downloading the video to the file system, among
1538 other possible outcomes.
1540 The dictionaries must include the following fields:
1542 id: Video identifier.
1543 url: Final video URL.
1544 title: Video title, unescaped.
1545 ext: Video filename extension.
1546 uploader: Full name of the video uploader.
1547 upload_date: Video upload date (YYYYMMDD).
1549 The following fields are optional:
1551 format: The video format, defaults to ext (used for --get-format)
1552 thumbnail: Full URL to a video thumbnail image.
1553 description: One-line video description.
1554 uploader_id: Nickname or id of the video uploader.
1555 player_url: SWF Player URL (used for rtmpdump).
1556 subtitles: The .srt file contents.
1557 urlhandle: [internal] The urlHandle to be used to download the file,
1558 like returned by urllib.request.urlopen
1560 The fields should all be Unicode strings.
1562 Subclasses of this one should re-define the _real_initialize() and
1563 _real_extract() methods and define a _VALID_URL regexp.
1564 Probably, they should also be added to the list of extractors.
1566 _real_extract() must return a *list* of information dictionaries as
1569 Finally, the _WORKING attribute should be set to False for broken IEs
1570 in order to warn the users and skip the tests.
1577 def __init__(self, downloader=None):
1578 """Constructor. Receives an optional downloader."""
1580 self.set_downloader(downloader)
def suitable(self, url):
    """Return True when *url* matches this extractor's _VALID_URL pattern."""
    return bool(re.match(self._VALID_URL, url))
1587 """Getter method for _WORKING."""
1588 return self._WORKING
1590 def initialize(self):
1591 """Initializes an instance (authentication, etc)."""
1593 self._real_initialize()
1596 def extract(self, url):
1597 """Extracts URL information and returns it in list of dicts."""
1599 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach *downloader* to this extractor for later status reporting."""
    self._downloader = downloader
1605 def _real_initialize(self):
1606 """Real initialization process. Redefine in subclasses."""
1609 def _real_extract(self, url):
1610 """Real extraction process. Redefine in subclasses."""
1615 return type(self).__name__[:-2]
1617 class YoutubeIE(InfoExtractor):
1618 """Information extractor for youtube.com."""
1622 (?:https?://)? # http(s):// (optional)
1623 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
1624 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
1625 (?:.*?\#/)? # handle anchor (#/) redirect urls
1626 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
1627 (?: # the various things that can precede the ID:
1628 (?:(?:v|embed|e)/) # v/ or embed/ or e/
1629 |(?: # or the v= param in all its forms
1630 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
1631 (?:\?|\#!?) # the params delimiter ? or # or #!
1632 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
1635 )? # optional -> youtube.com/xxxx is OK
1636 )? # all until now is optional -> you can pass the naked ID
1637 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
1638 (?(1).+)? # if we found the ID, everything can follow
1640 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1641 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1642 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1643 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
1644 _NETRC_MACHINE = 'youtube'
1645 # Listed in order of quality
1646 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1647 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1648 _video_extensions = {
1654 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1660 _video_dimensions = {
1676 IE_NAME = u'youtube'
def suitable(self, url):
    """Return True when *url* matches _VALID_URL (a verbose-mode regexp)."""
    return bool(re.match(self._VALID_URL, url, re.VERBOSE))
def report_lang(self):
    """Announce that the interface language is being set."""
    self._downloader.to_screen(u'[youtube] Setting language')
def report_login(self):
    """Announce that a login attempt is underway."""
    self._downloader.to_screen(u'[youtube] Logging in')
def report_age_confirmation(self):
    """Announce that the age-gate confirmation is being submitted."""
    self._downloader.to_screen(u'[youtube] Confirming age')
def report_video_webpage_download(self, video_id):
    """Announce that the watch page for *video_id* is being fetched."""
    message = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce that the get_video_info page for *video_id* is being fetched."""
    message = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_screen(message)
def report_video_subtitles_download(self, video_id):
    """Report attempt to download video subtitles.

    (Docstring fixed: it was copy-pasted from the info-webpage reporter
    and wrongly claimed this method reports the info webpage download.)
    """
    self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
def report_information_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has started."""
    message = u'[youtube] %s: Extracting video information' % video_id
    self._downloader.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Report that the requested *format* is not available for *video_id*.

    (Docstring fixed: it previously said "Report extracted video URL.",
    a copy-paste error — this method reports an unavailable format.)
    """
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Announce that the download will use the RTMP protocol."""
    self._downloader.to_screen(u'[youtube] RTMP download detected')
1718 def _closed_captions_xml_to_srt(self, xml_string):
1720 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1721 # TODO parse xml instead of regex
1722 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1723 if not dur: dur = '4'
1724 start = float(start)
1725 end = start + float(dur)
1726 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1727 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1728 caption = unescapeHTML(caption)
1729 caption = unescapeHTML(caption) # double cycle, intentional
1730 srt += str(n+1) + '\n'
1731 srt += start + ' --> ' + end + '\n'
1732 srt += caption + '\n\n'
1735 def _extract_subtitles(self, video_id):
1736 self.report_video_subtitles_download(video_id)
1737 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1739 srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
1740 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1741 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
1742 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
1743 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
1744 if not srt_lang_list:
1745 return (u'WARNING: video has no closed captions', None)
1746 if self._downloader.params.get('subtitleslang', False):
1747 srt_lang = self._downloader.params.get('subtitleslang')
1748 elif 'en' in srt_lang_list:
1751 srt_lang = list(srt_lang_list.keys())[0]
1752 if not srt_lang in srt_lang_list:
1753 return (u'WARNING: no closed captions found in the specified language', None)
1754 request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
1756 srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
1757 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1758 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
1760 return (u'WARNING: unable to download video subtitles', None)
1761 return (None, self._closed_captions_xml_to_srt(srt_xml))
1763 def _print_formats(self, formats):
1764 print('Available formats:')
1766 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
1768 def _real_initialize(self):
1769 if self._downloader is None:
1774 downloader_params = self._downloader.params
1776 # Attempt to use provided username and password or .netrc data
1777 if downloader_params.get('username', None) is not None:
1778 username = downloader_params['username']
1779 password = downloader_params['password']
1780 elif downloader_params.get('usenetrc', False):
1782 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1783 if info is not None:
1787 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1788 except (IOError, netrc.NetrcParseError) as err:
1789 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
1793 request = compat_urllib_request.Request(self._LANG_URL)
1796 compat_urllib_request.urlopen(request).read()
1797 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1798 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
1801 # No authentication to be performed
1802 if username is None:
1807 'current_form': 'loginForm',
1809 'action_login': 'Log In',
1810 'username': username,
1811 'password': password,
1813 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1816 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
1817 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1818 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1820 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1821 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
1827 'action_confirm': 'Confirm',
1829 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
1831 self.report_age_confirmation()
1832 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
1833 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1834 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
1837 def _extract_id(self, url):
1838 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1840 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1842 video_id = mobj.group(2)
1845 def _real_extract(self, url):
1846 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1847 mobj = re.search(self._NEXT_URL_RE, url)
1849 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1850 video_id = self._extract_id(url)
1853 self.report_video_webpage_download(video_id)
1854 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1855 request = compat_urllib_request.Request(url)
1857 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1858 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1859 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
1862 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1864 # Attempt to extract SWF player URL
1865 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1866 if mobj is not None:
1867 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1872 self.report_video_info_webpage_download(video_id)
1873 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1874 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1875 % (video_id, el_type))
1876 request = compat_urllib_request.Request(video_info_url)
1878 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
1879 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
1880 video_info = compat_parse_qs(video_info_webpage)
1881 if 'token' in video_info:
1883 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1884 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
1886 if 'token' not in video_info:
1887 if 'reason' in video_info:
1888 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
1890 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1893 # Check for "rental" videos
1894 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1895 self._downloader.trouble(u'ERROR: "rental" videos not supported')
1898 # Start extracting information
1899 self.report_information_extraction(video_id)
1902 if 'author' not in video_info:
1903 self._downloader.trouble(u'ERROR: unable to extract uploader name')
1905 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1908 video_uploader_id = None
1909 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
1910 if mobj is not None:
1911 video_uploader_id = mobj.group(1)
1913 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
1916 if 'title' not in video_info:
1917 self._downloader.trouble(u'ERROR: unable to extract video title')
1919 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1922 if 'thumbnail_url' not in video_info:
1923 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1924 video_thumbnail = ''
1925 else: # don't panic if we can't find it
1926 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1930 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1931 if mobj is not None:
1932 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1933 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1934 for expression in format_expressions:
1936 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1941 video_description = get_element_by_id("eow-description", video_webpage)
1942 if video_description:
1943 video_description = clean_html(video_description)
1945 video_description = ''
1948 video_subtitles = None
1949 if self._downloader.params.get('writesubtitles', False):
1950 (srt_error, video_subtitles) = self._extract_subtitles(video_id)
1952 self._downloader.trouble(srt_error)
1954 if 'length_seconds' not in video_info:
1955 self._downloader.trouble(u'WARNING: unable to extract video duration')
1958 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1961 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
1963 # Decide which formats to download
1964 req_format = self._downloader.params.get('format', None)
1966 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1967 self.report_rtmp_download()
1968 video_url_list = [(None, video_info['conn'][0])]
1969 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1970 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1971 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
1972 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
1973 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
1975 format_limit = self._downloader.params.get('format_limit', None)
1976 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1977 if format_limit is not None and format_limit in available_formats:
1978 format_list = available_formats[available_formats.index(format_limit):]
1980 format_list = available_formats
1981 existing_formats = [x for x in format_list if x in url_map]
1982 if len(existing_formats) == 0:
1983 self._downloader.trouble(u'ERROR: no known formats available for video')
1985 if self._downloader.params.get('listformats', None):
1986 self._print_formats(existing_formats)
1988 if req_format is None or req_format == 'best':
1989 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1990 elif req_format == 'worst':
1991 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1992 elif req_format in ('-1', 'all'):
1993 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1995 # Specific formats. We pick the first in a slash-delimeted sequence.
1996 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1997 req_formats = req_format.split('/')
1998 video_url_list = None
1999 for rf in req_formats:
2001 video_url_list = [(rf, url_map[rf])]
2003 if video_url_list is None:
2004 self._downloader.trouble(u'ERROR: requested format not available')
2007 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
2011 for format_param, video_real_url in video_url_list:
2013 video_extension = self._video_extensions.get(format_param, 'flv')
2015 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
2016 self._video_dimensions.get(format_param, '???'))
2020 'url': video_real_url,
2021 'uploader': video_uploader,
2022 'uploader_id': video_uploader_id,
2023 'upload_date': upload_date,
2024 'title': video_title,
2025 'ext': video_extension,
2026 'format': video_format,
2027 'thumbnail': video_thumbnail,
2028 'description': video_description,
2029 'player_url': player_url,
2030 'subtitles': video_subtitles,
2031 'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    NOTE(review): this copy of the file has lost several physical lines
    (``try:`` statements, ``if mobj is None:`` guards, the ``return [{``
    openings).  The surviving code tokens are left untouched below; lost
    spots are flagged with NOTE(review) comments.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Visiting this page establishes the family-filter cookie context.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # POST endpoint used to confirm age / disable the family filter.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): enclosing 'try:' line lost in this copy.
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        # NOTE(review): the 'disclaimer_form = {' opening (and likely a
        # 'filters' entry) was lost here; one dict entry survives.
        'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): enclosing 'try:' line lost in this copy.
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard and 'return' lost here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate YouTube-hosted videos to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # NOTE(review): enclosing 'try:' line lost in this copy.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            # Extension is guessed from the last three URL characters.
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            # NOTE(review): 'if mobj is None:' / 'else:' structure lost here.
            video_url = mediaURL
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback: parse the flashvars block instead.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract media URL')

        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # Unescape JSON-escaped slashes in the media URL.
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract title')
        # NOTE(review): .decode() on a str fails under Python 3 — webpage
        # handling here predates the bytes/str split; verify before running.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the 'return [{' opening of the info dict lost here.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion.

    NOTE(review): this copy has lost physical lines ('if mobj is None:'
    guards, 'max_quality = key' / 'break' inside the quality loop, the
    'return [{' opening).  Code tokens are preserved; losses are flagged.
    """

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard and 'return' lost here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip title suffix and query noise from the captured id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages still render.
        request.add_header('Cookie', 'family_filter=off')
        # NOTE(review): enclosing 'try:' line lost in this copy.
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality, highest first.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                # NOTE(review): 'max_quality = key' assignment lost here.
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                # NOTE(review): 'break' and the loop's 'else:' clause lost here.
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        # Unescape JSON-escaped slashes.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # NOTE(review): 'if mobj is None:' branch structure lost around here.
        # looking for official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        # NOTE(review): 'else:' lines lost before each assignment below.
        video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the 'return [{' opening and 'id'/'url' entries lost here.
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    NOTE(review): this copy has lost physical lines ('try:' statements,
    'if mobj is None:' guards, the 'return [{' opening).  Code tokens are
    preserved; losses are flagged inline.
    """

    # Only .flv media referenced via the 'current=' query param are matched.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard and 'return' lost here.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): enclosing 'try:' line lost in this copy.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract title')
        # NOTE(review): .decode() on str fails under Python 3; this code
        # predates the bytes/str split — verify before running.
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the 'return [{' opening of the info dict lost here.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    NOTE(review): this copy has lost physical lines ('try:' statements,
    'if mobj is None:' guards, the 'return [{' opening).  Code tokens are
    preserved; losses are flagged inline.
    """

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard and 'return' lost here.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            # NOTE(review): enclosing 'try:' line lost in this copy.
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            # NOTE(review): 'if mobj is None:' guard lost here.
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            # NOTE(review): 'if mobj is None:' guard lost here.
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse exactly once on the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): enclosing 'try:' line lost in this copy.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        # NOTE(review): enclosing 'try:' line lost in this copy.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the 'return [{' opening and the 'url' entry lost here.
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    NOTE(review): this copy has lost physical lines ('try:'/'except' around
    the config parse, 'break'/'else:' in the quality loop, the IE_NAME
    attribute, the 'return [{' opening).  Code tokens are preserved.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    # NOTE(review): the IE_NAME = u'vimeo' line appears lost in this copy.

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard and 'return' lost here.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        # NOTE(review): enclosing 'try:' line lost in this copy.
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        # NOTE(review): enclosing 'try:' line lost in this copy.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        # NOTE(review): the 'except:' line preceding this handler body lost.
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            # Reassemble ISO date parts as YYYYMMDD.
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # NOTE(review): 'else:' line lost before the append below.
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                # NOTE(review): 'break' and the loop's 'else:' clause lost here.
            self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # NOTE(review): the 'return [{' opening and 'id'/'url' entries lost here.
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
2585 class ArteTvIE(InfoExtractor):
2586 """arte.tv information extractor."""
2588 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
2589 _LIVE_URL = r'index-[0-9]+\.html$'
2591 IE_NAME = u'arte.tv'
    def __init__(self, downloader=None):
        """Constructor. Delegates to the base class, which stores the downloader."""
        InfoExtractor.__init__(self, downloader)
2596 def report_download_webpage(self, video_id):
2597 """Report webpage download."""
2598 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
2600 def report_extraction(self, video_id):
2601 """Report information extraction."""
2602 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
2604 def fetch_webpage(self, url):
2605 self._downloader.incre#!/usr/bin/env python
2606 # -*- coding: utf-8 -*-
2608 from __future__ import absolute_import
2617 import xml.etree.ElementTree
2621 from .utils import *
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.

    NOTE(review): this copy of the class has lost several physical lines
    (e.g. the 'def working(self):' and '@property / def IE_NAME(self):'
    headers); the surviving tokens are preserved below.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the 'def working(self):' header line lost in this copy.
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): the '_ready' guard lines appear lost in this copy.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): the '@property' / 'def IE_NAME(self):' lines lost here.
        # Derive the extractor name from the class name, dropping the 'IE' suffix.
        return type(self).__name__[:-2]
2712 class YoutubeIE(InfoExtractor):
2713 """Information extractor for youtube.com."""
2717 (?:https?://)? # http(s):// (optional)
2718 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
2719 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
2720 (?:.*?\#/)? # handle anchor (#/) redirect urls
2721 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
2722 (?: # the various things that can precede the ID:
2723 (?:(?:v|embed|e)/) # v/ or embed/ or e/
2724 |(?: # or the v= param in all its forms
2725 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
2726 (?:\?|\#!?) # the params delimiter ? or # or #!
2727 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
2730 )? # optional -> youtube.com/xxxx is OK
2731 )? # all until now is optional -> you can pass the naked ID
2732 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
2733 (?(1).+)? # if we found the ID, everything can follow
2735 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
2736 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
2737 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
2738 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2739 _NETRC_MACHINE = 'youtube'
2740 # Listed in order of quality
2741 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
2742 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
2743 _video_extensions = {
2749 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
2755 _video_dimensions = {
2771 IE_NAME = u'youtube'
2773 def suitable(self, url):
2774 """Receives a URL and returns True if suitable for this IE."""
2775 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2777 def report_lang(self):
2778 """Report attempt to set language."""
2779 self._downloader.to_screen(u'[youtube] Setting language')
2781 def report_login(self):
2782 """Report attempt to log in."""
2783 self._downloader.to_screen(u'[youtube] Logging in')
2785 def report_age_confirmation(self):
2786 """Report attempt to confirm age."""
2787 self._downloader.to_screen(u'[youtube] Confirming age')
2789 def report_video_webpage_download(self, video_id):
2790 """Report attempt to download video webpage."""
2791 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
2793 def report_video_info_webpage_download(self, video_id):
2794 """Report attempt to download video info webpage."""
2795 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
2797 def report_video_subtitles_download(self, video_id):
2798 """Report attempt to download video info webpage."""
2799 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
2801 def report_information_extraction(self, video_id):
2802 """Report attempt to extract video information."""
2803 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
2805 def report_unavailable_format(self, video_id, format):
2806 """Report extracted video URL."""
2807 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
2809 def report_rtmp_download(self):
2810 """Indicate the download will use the RTMP protocol."""
2811 self._downloader.to_screen(u'[youtube] RTMP download detected')
2813 def _closed_captions_xml_to_srt(self, xml_string):
2815 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
2816 # TODO parse xml instead of regex
2817 for n, (start, dur_tag, dur, caption) in enumerate(texts):
2818 if not dur: dur = '4'
2819 start = float(start)
2820 end = start + float(dur)
2821 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
2822 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
2823 caption = unescapeHTML(caption)
2824 caption = unescapeHTML(caption) # double cycle, intentional
2825 srt += str(n+1) + '\n'
2826 srt += start + ' --> ' + end + '\n'
2827 srt += caption + '\n\n'
    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id and convert them to SRT.

        Returns a (warning_message_or_None, srt_contents_or_None) pair.

        NOTE(review): this copy has lost the 'try:' lines around both
        network calls, the "srt_lang = 'en'" / 'else:' pair, and the
        'if not srt_xml:' guard; surviving tokens are preserved below.
        """
        self.report_video_subtitles_download(video_id)
        # First fetch the list of available caption tracks.
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): enclosing 'try:' line lost in this copy.
        srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Choose language: explicit option, else English, else first available.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            # NOTE(review): "srt_lang = 'en'" and the 'else:' line lost here.
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        # NOTE(review): enclosing 'try:' line lost in this copy.
        srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # NOTE(review): 'if not srt_xml:' guard line lost before this return.
        return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))
    def _print_formats(self, formats):
        """Print itag, file extension and dimensions for each format.

        NOTE(review): the ``for x in formats:`` loop header appears to be
        elided from this listing.
        """
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set the YouTube language cookie, then log in and confirm age.

        Best-effort: credential and network problems are reported as
        warnings via to_stderr rather than aborting.
        NOTE(review): several ``try:``/``else:``/``return`` and dict-literal
        lines are elided from this listing.
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set the interface language before logging in.
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        if username is None:
        # Login form fields (enclosing dict literal partly elided).
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # A login form still present in the response means the login failed.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
        # Confirm age to bypass the age gate (form dict partly elided).
        'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Extract and return the YouTube video id from *url*.

        NOTE(review): the ``if mobj is None:`` guard and the ``return``
        lines are elided from this listing.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Extract the information dictionaries for a YouTube URL.

        NOTE(review): numerous ``try:``/``else:``/``return``/``break``
        lines and the dict-literal braces of the result are elided from
        this listing; control flow below is only partially visible.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Fetch the watch page (forced US/English, age-verified variant).
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the backslash-escaped URL found in the page JS.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Fetch video info, trying several 'el' values until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader name.
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # Uploader id (nickname), scraped from the watch page.
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Title.
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # Thumbnail (optional).
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scraped from the page and normalised to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description.
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # Closed captions, only when requested by the user.
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # Session token needed for the download URLs.
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): 'sig' is read unconditionally although the filter
            # above only requires 'itag' and 'url' — a missing 'sig' would
            # raise KeyError here; verify against the full source.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                video_url_list = [(rf, url_map[rf])]
            if video_url_list is None:
                self._downloader.trouble(u'ERROR: requested format not available')
        self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dictionary per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension for this format, defaulting to flv.
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                    self._video_dimensions.get(format_param, '???'))

            # Result fields (enclosing dict literal braces elided in listing).
            'url': video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): several 'try:'/guard/'return' lines are elided from
    # this listing throughout the class.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then disable the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Age-confirmation form (enclosing dict literal partly elided).
        'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract video information from a metacafe.com watch URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate YouTube-hosted videos to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            video_url = mediaURL
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # Fallback: parse the flashvars blob for mediaData.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result fields (enclosing 'return [{...}]' lines elided in listing).
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): several 'try:'/guard/'return'/'break' lines are
    # elided from this listing throughout the class.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video information from a Dailymotion video URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable family filtering so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality, highest first.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # lookin for official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; normalise to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # Result fields (enclosing 'return [{...}]' lines elided in listing).
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): several 'try:'/guard/'return' lines are elided from
    # this listing throughout the class.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video information from a photobucket.com URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # Result fields (enclosing 'return [{...}]' lines elided in listing).
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): several 'try:'/guard/'return' lines are elided from
    # this listing throughout the class.
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a video.yahoo.com URL.

        Non-'/watch/' URLs are first rewritten to the canonical watch
        page and re-extracted once (new_video=False on the recursive call).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) captures the '(people|profile)' literal;
        # the uploader name appears to be group(2) — looks like a bug, verify.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # Result fields (enclosing 'return [{...}]' lines elided in listing).
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # NOTE(review): several 'try:'/guard/'return'/'break' lines are
    # elided from this listing throughout the class.
    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Prefer HD, then SD, then whatever else is available.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # Result fields (enclosing 'return [{...}]' lines elided in listing).
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
3680 class ArteTvIE(InfoExtractor):
3681 """arte.tv information extractor."""
3683 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
3684 _LIVE_URL = r'index-[0-9]+\.html$'
3686 IE_NAME = u'arte.tv'
3688 def __init__(self, downloader=None):
3689 InfoExtractor.__init__(self, downloader)
3691 def report_download_webpage(self, video_id):
3692 """Report webpage download."""
3693 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
3695 def report_extraction(self, video_id):
3696 """Report information extraction."""
3697 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
    def fetch_webpage(self, url):
        """Download *url* and return the page body.

        NOTE(review): the ``try:`` and ``return`` lines are elided from
        this listing.
        """
        # Count this fetch towards the downloader's download counter.
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, match *regex* and collect groups into a dict.

        matchTuples is a sequence of (group_index, key, error_message)
        triples; each matched group is stored under *key*.
        NOTE(review): the ``info`` dict initialisation and the trailing
        ``return info`` appear to be elided from this listing.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
            info[key] = mobj.group(i)
3731 def extractLiveStream(self, url):
3732 video_lang = url.split('/')[-4]
3733 info = self.grep_webpage(
3735 r'src="(.*?/videothek_js.*?\.js)',
3738 (1, 'url', u'ERROR: Invalid URL: %s' % url)
3741 http_host = url.split('/')[2]
3742 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
3743 info = self.grep_webpage(
3745 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
3746 '(http://.*?\.swf).*?' +
3750 (1, 'path', u'ERROR: could not extract video path: %s' % url),
3751 (2, 'player', u'ERROR: could not extract video player: %s' % url),
3752 (3, 'url', u'ERROR: could not extract video url: %s' % url)
3755 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
3757 def extractPlus7Stream(self, url):
3758 video_lang = url.split('/')[-3]
3759 info = self.grep_webpage(
3761 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
3764 (1, 'url', u'ERROR: Invalid URL: %s' % url)
3767 next_url = compat_urllib_parse.unquote(info.get('url'))
3768 info = self.grep_webpage(
3770 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
3773 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
3776 next_url = compat_urllib_parse.unquote(info.get('url'))
3778 info = self.grep_webpage(
3780 r'<video id="(.*?)".*?>.*?' +
3781 '<name>(.*?)</name>.*?' +
3782 '<dateVideo>(.*?)</dateVideo>.*?' +
3783 '<url quality="hd">(.*?)</url>',
3786 (1, 'id', u'ERROR: could not extract video id: %s' % url),
3787 (2, 'title', u'ERROR: could not extract video title: %s' % url),
3788 (3, 'date', u'ERROR: could not extract video date: %s' % url),
3789 (4, 'url', u'ERROR: could not extract video url: %s' % url)
3794 'id': info.get('id'),
3795 'url': compat_urllib_parse.unquote(info.get('url')),
3796 'uploader': u'arte.tv',
3797 'upload_date': info.get('date'),
3798 'title': info.get('title').decode('utf-8'),
3804 def _real_extract(self, url):
3805 video_id = url.split('/')[-1]
3806 self.report_extraction(video_id)
3808 if re.search(self._LIVE_URL, video_id) is not None:
3809 self.extractLiveStream(url)
3812 info = self.extractPlus7Stream(url)
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements, guard clauses) are missing from this view;
# comments below describe only what the visible lines establish.
3817 class GenericIE(InfoExtractor):
3818 """Generic last-resort information extractor."""
3821 IE_NAME = u'generic'
3823 def __init__(self, downloader=None):
3824 InfoExtractor.__init__(self, downloader)
3826 def report_download_webpage(self, video_id):
3827 """Report webpage download."""
3828 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
3829 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
3831 def report_extraction(self, video_id):
3832 """Report information extraction."""
3833 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
3835 def report_following_redirect(self, new_url):
3836 """Report information extraction."""
3837 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
3839 def _test_redirect(self, url):
3840 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Request subclass that issues HTTP HEAD instead of GET, so redirects can be
# resolved without downloading response bodies.
3841 class HeadRequest(compat_urllib_request.Request):
3842 def get_method(self):
3845 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
3847 Subclass the HTTPRedirectHandler to make it use our
3848 HeadRequest also on the redirected URL
3850 def redirect_request(self, req, fp, code, msg, headers, newurl):
3851 if code in (301, 302, 303, 307):
3852 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers: a HEAD follow-up carries no payload.
3853 newheaders = dict((k,v) for k,v in req.headers.items()
3854 if k.lower() not in ("content-length", "content-type"))
3855 return HeadRequest(newurl,
3857 origin_req_host=req.get_origin_req_host(),
3860 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
3862 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
3864 Fallback to GET if HEAD is not allowed (405 HTTP error)
3866 def http_error_405(self, req, fp, code, msg, headers):
3870 newheaders = dict((k,v) for k,v in req.headers.items()
3871 if k.lower() not in ("content-length", "content-type"))
# Re-issue the same URL as a plain (GET) request through the parent opener.
3872 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
3874 origin_req_host=req.get_origin_req_host(),
# Build a dedicated opener wired with the HEAD-aware handlers above.
3878 opener = compat_urllib_request.OpenerDirector()
3879 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
3880 HTTPMethodFallback, HEADRedirectHandler,
3881 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
3882 opener.add_handler(handler())
3884 response = opener.open(HeadRequest(url))
3885 new_url = response.geturl()
# Restart the whole extraction chain on the resolved URL so a more specific
# extractor can claim it.
3890 self.report_following_redirect(new_url)
3891 self._downloader.download([new_url])
3894 def _real_extract(self, url):
3895 if self._test_redirect(url): return
3897 video_id = url.split('/')[-1]
3898 request = compat_urllib_request.Request(url)
3900 self.report_download_webpage(video_id)
3901 webpage = compat_urllib_request.urlopen(request).read()
3902 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3903 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3905 except ValueError as err:
3906 # since this is the last-resort InfoExtractor, if
3907 # this error is thrown, it'll be thrown here
3908 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3911 self.report_extraction(video_id)
3912 # Start with something easy: JW Player in SWFObject
3913 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
3915 # Broaden the search a little bit
3916 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
3918 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3921 # It's possible that one of the regexes
3922 # matched, but returned an empty group:
3923 if mobj.group(1) is None:
3924 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3927 video_url = compat_urllib_parse.unquote(mobj.group(1))
3928 video_id = os.path.basename(video_url)
3930 # here's a fun little line of code for you:
3931 video_extension = os.path.splitext(video_id)[1][1:]
3932 video_id = os.path.splitext(video_id)[0]
3934 # it's tempting to parse this further, but you would
3935 # have to take into account all the variations like
3936 #   Video Title - Site Name
3937 #   Site Name | Video Title
3938 #   Video Title - Tagline | Site Name
3939 # and so on and so forth; it's just not practical
3940 mobj = re.search(r'<title>(.*)</title>', webpage)
3942 self._downloader.trouble(u'ERROR: unable to extract title')
3944 video_title = mobj.group(1)
3946 # video uploader is domain name
3947 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
3949 self._downloader.trouble(u'ERROR: unable to extract title')
3951 video_uploader = mobj.group(1)
3956 'uploader': video_uploader,
3957 'upload_date': None,
3958 'title': video_title,
3959 'ext': video_extension,
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements, `if mobj is None:` guards) are missing from this
# view; comments below describe only what the visible lines establish.
3963 class YoutubeSearchIE(InfoExtractor):
3964 """Information Extractor for YouTube search queries."""
# Queries look like "ytsearch:foo", "ytsearchN:foo" or "ytsearchall:foo".
3965 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# GData JSON-C API; %i is the 1-based start index, page size is fixed at 50.
3966 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
3967 _max_youtube_results = 1000
3968 IE_NAME = u'youtube:search'
3970 def __init__(self, downloader=None):
3971 InfoExtractor.__init__(self, downloader)
3973 def report_download_page(self, query, pagenum):
3974 """Report attempt to download search page with given number."""
3975 query = query.decode(preferredencoding())
3976 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix ('', 'all' or a number) and dispatch to _download_n_results.
3978 def _real_extract(self, query):
3979 mobj = re.match(self._VALID_URL, query)
3981 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
3984 prefix, query = query.split(':')
3986 query = query.encode('utf-8')
# Empty prefix means "first result only".
3988 self._download_n_results(query, 1)
3990 elif prefix == 'all':
3991 self._download_n_results(query, self._max_youtube_results)
3997 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
# Requests above the API ceiling are clamped with a warning, not rejected.
3999 elif n > self._max_youtube_results:
4000 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
4001 n = self._max_youtube_results
4002 self._download_n_results(query, n)
4004 except ValueError: # parsing prefix as integer fails
4005 self._download_n_results(query, 1)
4008 def _download_n_results(self, query, n):
4009 """Downloads a specified number of results for a query"""
# Page through the API 50 results at a time until `limit` ids are collected;
# `limit` shrinks to totalItems when the API reports fewer matches than n.
4015 while (50 * pagenum) < limit:
4016 self.report_download_page(query, pagenum+1)
4017 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
4018 request = compat_urllib_request.Request(result_url)
4020 data = compat_urllib_request.urlopen(request).read()
4021 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4022 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
4024 api_response = json.loads(data)['data']
4026 new_ids = list(video['id'] for video in api_response['items'])
4027 video_ids += new_ids
4029 limit = min(n, api_response['totalItems'])
# Trim any overshoot from the last page, then hand each id to the downloader.
4032 if len(video_ids) > n:
4033 video_ids = video_ids[:n]
4034 for id in video_ids:
4035 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements, guard clauses) are missing from this view;
# comments below describe only what the visible lines establish.
4039 class GoogleSearchIE(InfoExtractor):
4040 """Information Extractor for Google Video search queries."""
# Queries look like "gvsearch:foo", "gvsearchN:foo" or "gvsearchall:foo".
4041 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
4042 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
4043 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" link tells us whether to keep paginating.
4044 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
4045 _max_google_results = 1000
4046 IE_NAME = u'video.google:search'
4048 def __init__(self, downloader=None):
4049 InfoExtractor.__init__(self, downloader)
4051 def report_download_page(self, query, pagenum):
4052 """Report attempt to download playlist page with given number."""
4053 query = query.decode(preferredencoding())
4054 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix ('', 'all' or a number) and dispatch to _download_n_results;
# same structure as the other search extractors.
4056 def _real_extract(self, query):
4057 mobj = re.match(self._VALID_URL, query)
4059 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
4062 prefix, query = query.split(':')
4064 query = query.encode('utf-8')
4066 self._download_n_results(query, 1)
4068 elif prefix == 'all':
4069 self._download_n_results(query, self._max_google_results)
4075 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
# Requests above the ceiling are clamped with a warning, not rejected.
4077 elif n > self._max_google_results:
4078 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
4079 n = self._max_google_results
4080 self._download_n_results(query, n)
4082 except ValueError: # parsing prefix as integer fails
4083 self._download_n_results(query, 1)
4086 def _download_n_results(self, query, n):
4087 """Downloads a specified number of results for a query"""
# Scrape the HTML result pages (10 results per page via start=pagenum*10),
# collecting docids until n are found or no "next" link remains.
4093 self.report_download_page(query, pagenum)
4094 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
4095 request = compat_urllib_request.Request(result_url)
4097 page = compat_urllib_request.urlopen(request).read()
4098 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4099 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
4102 # Extract video identifiers
4103 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
4104 video_id = mobj.group(1)
# De-duplicate ids across pages before counting toward n.
4105 if video_id not in video_ids:
4106 video_ids.append(video_id)
4107 if len(video_ids) == n:
4108 # Specified n videos reached
4109 for id in video_ids:
4110 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No further pages: download whatever was collected.
4113 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
4114 for id in video_ids:
4115 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
4118 pagenum = pagenum + 1
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements, guard clauses) are missing from this view;
# comments below describe only what the visible lines establish.
4121 class YahooSearchIE(InfoExtractor):
4122 """Information Extractor for Yahoo! Video search queries."""
# Queries look like "yvsearch:foo", "yvsearchN:foo" or "yvsearchall:foo".
4125 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
4126 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
4127 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
4128 _MORE_PAGES_INDICATOR = r'\s*Next'
4129 _max_yahoo_results = 1000
4130 IE_NAME = u'video.yahoo:search'
4132 def __init__(self, downloader=None):
4133 InfoExtractor.__init__(self, downloader)
4135 def report_download_page(self, query, pagenum):
4136 """Report attempt to download playlist page with given number."""
4137 query = query.decode(preferredencoding())
4138 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix ('', 'all' or a number) and dispatch to _download_n_results;
# same structure as the other search extractors.
4140 def _real_extract(self, query):
4141 mobj = re.match(self._VALID_URL, query)
4143 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
4146 prefix, query = query.split(':')
4148 query = query.encode('utf-8')
4150 self._download_n_results(query, 1)
4152 elif prefix == 'all':
4153 self._download_n_results(query, self._max_yahoo_results)
4159 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
# Requests above the ceiling are clamped with a warning, not rejected.
4161 elif n > self._max_yahoo_results:
4162 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
4163 n = self._max_yahoo_results
4164 self._download_n_results(query, n)
4166 except ValueError: # parsing prefix as integer fails
4167 self._download_n_results(query, 1)
4170 def _download_n_results(self, query, n):
4171 """Downloads a specified number of results for a query"""
# `already_seen` de-duplicates ids across result pages (ordering is kept by
# appending to video_ids separately).
4174 already_seen = set()
4178 self.report_download_page(query, pagenum)
4179 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
4180 request = compat_urllib_request.Request(result_url)
4182 page = compat_urllib_request.urlopen(request).read()
4183 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4184 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
4187 # Extract video identifiers
4188 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
4189 video_id = mobj.group(1)
4190 if video_id not in already_seen:
4191 video_ids.append(video_id)
4192 already_seen.add(video_id)
4193 if len(video_ids) == n:
4194 # Specified n videos reached
4195 for id in video_ids:
4196 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No further pages: download whatever was collected.
4199 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
4200 for id in video_ids:
4201 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
4204 pagenum = pagenum + 1
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements, guard clauses) are missing from this view;
# comments below describe only what the visible lines establish.
4207 class YoutubePlaylistIE(InfoExtractor):
4208 """Information Extractor for YouTube playlists."""
# Group 1: list-type key ('p', 'a' or 'list'); group 2: the playlist id;
# optional group 3: a direct video reference embedded in the playlist URL.
4210 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
4211 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
4212 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
# The "Next »" pagination marker; its absence means the last page was reached.
4213 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
4214 IE_NAME = u'youtube:playlist'
4216 def __init__(self, downloader=None):
4217 InfoExtractor.__init__(self, downloader)
4219 def report_download_page(self, playlist_id, pagenum):
4220 """Report attempt to download playlist page with given number."""
4221 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
4223 def _real_extract(self, url):
4224 # Extract playlist id
4225 mobj = re.match(self._VALID_URL, url)
4227 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A single-video reference inside the playlist URL wins over the playlist.
4231 if mobj.group(3) is not None:
4232 self._downloader.download([mobj.group(3)])
4235 # Download playlist pages
4236 # prefix is 'p' as default for playlists but there are other types that need extra care
4237 playlist_prefix = mobj.group(1)
4238 if playlist_prefix == 'a':
4239 playlist_access = 'artist'
4241 playlist_prefix = 'p'
4242 playlist_access = 'view_play_list'
4243 playlist_id = mobj.group(2)
4248 self.report_download_page(playlist_id, pagenum)
4249 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
4250 request = compat_urllib_request.Request(url)
4252 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
4253 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4254 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
4257 # Extract video identifiers
# De-duplicate within a page before extending the overall list.
4259 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
4260 if mobj.group(1) not in ids_in_page:
4261 ids_in_page.append(mobj.group(1))
4262 video_ids.extend(ids_in_page)
4264 if self._MORE_PAGES_INDICATOR not in page:
4266 pagenum = pagenum + 1
4268 total = len(video_ids)
# Apply the user's --playlist-start/--playlist-end window (1-based options,
# converted to a 0-based slice; end == -1 means "to the end").
4270 playliststart = self._downloader.params.get('playliststart', 1) - 1
4271 playlistend = self._downloader.params.get('playlistend', -1)
4272 if playlistend == -1:
4273 video_ids = video_ids[playliststart:]
4275 video_ids = video_ids[playliststart:playlistend]
4277 if len(video_ids) == total:
4278 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
4280 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
4282 for id in video_ids:
4283 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements, guard clauses) are missing from this view;
# comments below describe only what the visible lines establish.
4287 class YoutubeChannelIE(InfoExtractor):
4288 """Information Extractor for YouTube channels."""
4290 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
# List view sorted by date-added ascending; %s slots are channel id and page.
4291 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# The "Next »" pagination marker; its absence means the last page was reached.
4292 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
4293 IE_NAME = u'youtube:channel'
4295 def report_download_page(self, channel_id, pagenum):
4296 """Report attempt to download channel page with given number."""
4297 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
4299 def _real_extract(self, url):
4300 # Extract channel id
4301 mobj = re.match(self._VALID_URL, url)
4303 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
4306 # Download channel pages
4307 channel_id = mobj.group(1)
4312 self.report_download_page(channel_id, pagenum)
4313 url = self._TEMPLATE_URL % (channel_id, pagenum)
4314 request = compat_urllib_request.Request(url)
4316 page = compat_urllib_request.urlopen(request).read().decode('utf8')
4317 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4318 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
4321 # Extract video identifiers
# De-duplicate within a page before extending the overall list.
4323 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
4324 if mobj.group(1) not in ids_in_page:
4325 ids_in_page.append(mobj.group(1))
4326 video_ids.extend(ids_in_page)
4328 if self._MORE_PAGES_INDICATOR not in page:
4330 pagenum = pagenum + 1
4332 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
4334 for id in video_ids:
4335 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements, guard clauses) are missing from this view;
# comments below describe only what the visible lines establish.
4339 class YoutubeUserIE(InfoExtractor):
4340 """Information Extractor for YouTube users."""
# Accepts both full user URLs and the "ytuser:NAME" shorthand.
4342 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
4343 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# The GData uploads feed caps each response at 50 entries.
4344 _GDATA_PAGE_SIZE = 50
4345 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
4346 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
4347 IE_NAME = u'youtube:user'
4349 def __init__(self, downloader=None):
4350 InfoExtractor.__init__(self, downloader)
4352 def report_download_page(self, username, start_index):
4353 """Report attempt to download user page."""
4354 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
4355 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
4357 def _real_extract(self, url):
4359 mobj = re.match(self._VALID_URL, url)
4361 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
4364 username = mobj.group(1)
4366 # Download video ids using YouTube Data API. Result size per
4367 # query is limited (currently to 50 videos) so we need to query
4368 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
4375 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
4376 self.report_download_page(username, start_index)
4378 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
4381 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
4382 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4383 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
4386 # Extract video identifiers
# De-duplicate within a page before extending the overall list.
4389 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
4390 if mobj.group(1) not in ids_in_page:
4391 ids_in_page.append(mobj.group(1))
4393 video_ids.extend(ids_in_page)
4395 # A little optimization - if current page is not
4396 # "full", ie. does not contain PAGE_SIZE video ids then
4397 # we can assume that this page is the last one - there
4398 # are no more ids on further pages - no need to query
4401 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
4406 all_ids_count = len(video_ids)
# Apply the user's --playlist-start/--playlist-end window (1-based options,
# converted to a 0-based slice; end == -1 means "to the end").
4407 playliststart = self._downloader.params.get('playliststart', 1) - 1
4408 playlistend = self._downloader.params.get('playlistend', -1)
4410 if playlistend == -1:
4411 video_ids = video_ids[playliststart:]
4413 video_ids = video_ids[playliststart:playlistend]
4415 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
4416 (username, all_ids_count, len(video_ids)))
4418 for video_id in video_ids:
4419 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements, guard clauses, the _PAGE_SIZE definition) are
# missing from this view; comments below describe only the visible lines.
4422 class BlipTVUserIE(InfoExtractor):
4423 """Information Extractor for blip.tv users."""
# Accepts both user URLs and the "bliptvuser:NAME" shorthand.
4425 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
4427 IE_NAME = u'blip.tv:user'
4429 def __init__(self, downloader=None):
4430 InfoExtractor.__init__(self, downloader)
4432 def report_download_page(self, username, pagenum):
4433 """Report attempt to download user page."""
4434 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
4435 (self.IE_NAME, username, pagenum))
4437 def _real_extract(self, url):
4439 mobj = re.match(self._VALID_URL, url)
4441 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
4444 username = mobj.group(1)
# Mobile AJAX endpoint; the numeric users_id is filled in after scraping the
# user page below.
4446 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
4448 request = compat_urllib_request.Request(url)
4451 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
4452 mobj = re.search(r'data-users-id="([^"]+)"', page)
4453 page_base = page_base % mobj.group(1)
4454 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4455 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
4459 # Download video ids using BlipTV Ajax calls. Result size per
4460 # query is limited (currently to 12 videos) so we need to query
4461 # page by page until there are no video ids - it means we got
4468 self.report_download_page(username, pagenum)
4470 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
4473 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
4474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4475 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
4478 # Extract video identifiers
# De-duplicate within a page; ids are HTML-unescaped before being stored.
4481 for mobj in re.finditer(r'href="/([^"]+)"', page):
4482 if mobj.group(1) not in ids_in_page:
4483 ids_in_page.append(unescapeHTML(mobj.group(1)))
4485 video_ids.extend(ids_in_page)
4487 # A little optimization - if current page is not
4488 # "full", ie. does not contain PAGE_SIZE video ids then
4489 # we can assume that this page is the last one - there
4490 # are no more ids on further pages - no need to query
4493 if len(ids_in_page) < self._PAGE_SIZE:
4498 all_ids_count = len(video_ids)
# Apply the user's --playlist-start/--playlist-end window (1-based options,
# converted to a 0-based slice; end == -1 means "to the end").
4499 playliststart = self._downloader.params.get('playliststart', 1) - 1
4500 playlistend = self._downloader.params.get('playlistend', -1)
4502 if playlistend == -1:
4503 video_ids = video_ids[playliststart:]
4505 video_ids = video_ids[playliststart:playlistend]
4507 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
4508 (self.IE_NAME, username, all_ids_count, len(video_ids)))
4510 for video_id in video_ids:
4511 self._downloader.download([u'http://blip.tv/'+video_id])
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements) are missing from this view; comments below
# describe only what the visible lines establish.
4514 class DepositFilesIE(InfoExtractor):
4515 """Information extractor for depositfiles.com"""
# The (?#locale) comment documents that the optional "../" path segment is a
# two-letter locale prefix.
4517 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
4518 IE_NAME = u'DepositFiles'
4520 def __init__(self, downloader=None):
4521 InfoExtractor.__init__(self, downloader)
4523 def report_download_webpage(self, file_id):
4524 """Report webpage download."""
4525 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
4527 def report_extraction(self, file_id):
4528 """Report information extraction."""
4529 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
4531 def _real_extract(self, url):
4532 file_id = url.split('/')[-1]
4533 # Rebuild url in english locale
4534 url = 'http://depositfiles.com/en/files/' + file_id
4536 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
4537 free_download_indication = { 'gateway_result' : '1' }
4538 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
4540 self.report_download_webpage(file_id)
4541 webpage = compat_urllib_request.urlopen(request).read()
4542 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4543 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
4546 # Search for the real file URL
4547 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
4548 if (mobj is None) or (mobj.group(1) is None):
4549 # Try to figure out reason of the error.
# The site explains restrictions (quota, parallel downloads, …) in a
# <strong>Attention...</strong> banner; surface that text if present.
4550 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
4551 if (mobj is not None) and (mobj.group(1) is not None):
4552 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
4553 self._downloader.trouble(u'ERROR: %s' % restriction_message)
4555 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
4558 file_url = mobj.group(1)
4559 file_extension = os.path.splitext(file_url)[1][1:]
4561 # Search for file title
4562 mobj = re.search(r'<b title="(.*?)">', webpage)
4564 self._downloader.trouble(u'ERROR: unable to extract title')
4566 file_title = mobj.group(1).decode('utf-8')
4569 'id': file_id.decode('utf-8'),
4570 'url': file_url.decode('utf-8'),
4572 'upload_date': None,
4573 'title': file_title,
4574 'ext': file_extension.decode('utf-8'),
4578 class FacebookIE(InfoExtractor):
4579 """Information Extractor for Facebook"""
4582 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
4583 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
4584 _NETRC_MACHINE = 'facebook'
4585 _available_formats = ['video', 'highqual', 'lowqual']
4586 _video_extensions = {
4591 IE_NAME = u'facebook'
4593 def __init__(self, downloader=None):
4594 InfoExtractor.__init__(self, downloader)
4596 def _reporter(self, message):
4597 """Add header and report message."""
4598 self._downloader.to_screen(u'[facebook] %s' % message)
4600 def report_login(self):
4601 """Report attempt to log in."""
4602 self._reporter(u'Logging in')
4604 def report_video_webpage_download(self, video_id):
4605 """Report attempt to download video webpage."""
4606 self._reporter(u'%s: Downloading video webpage' % video_id)
4608 def report_information_extraction(self, video_id):
4609 """Report attempt to extract video information."""
4610 self._reporter(u'%s: Extracting video information' % video_id)
4612 def _parse_page(self, video_webpage):
4613 """Extract video information from page"""
4615 data = {'title': r'\("video_title", "(.*?)"\)',
4616 'description': r'<div class="datawrap">(.*?)</div>',
4617 'owner': r'\("video_owner_name", "(.*?)"\)',
4618 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
4621 for piece in data.keys():
4622 mobj = re.search(data[piece], video_webpage)
4623 if mobj is not None:
4624 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
4628 for fmt in self._available_formats:
4629 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
4630 if mobj is not None:
4631 # URL is in a Javascript segment inside an escaped Unicode format within
4632 # the generally utf-8 page
4633 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
4634 video_info['video_urls'] = video_urls
4638 def _real_initialize(self):
4639 if self._downloader is None:
4644 downloader_params = self._downloader.params
4646 # Attempt to use provided username and password or .netrc data
4647 if downloader_params.get('username', None) is not None:
4648 useremail = downloader_params['username']
4649 password = downloader_params['password']
4650 elif downloader_params.get('usenetrc', False):
4652 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
4653 if info is not None:
4657 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
4658 except (IOError, netrc.NetrcParseError) as err:
4659 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
4662 if useremail is None:
4671 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
4674 login_results = compat_urllib_request.urlopen(request).read()
4675 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
4676 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
4678 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4679 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
# Extract real URL(s) and metadata for a Facebook video page.
# NOTE(review): this is an elided view of the file — interior lines (try
# headers, returns, else branches) are missing between the numbered lines.
4682 def _real_extract(self, url):
4683 mobj = re.match(self._VALID_URL, url)
# Reject URLs that do not match this IE's pattern.
4685 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4687 video_id = mobj.group('ID')
# Fetch the video webpage over HTTPS.
4690 self.report_video_webpage_download(video_id)
4691 request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
4693 page = compat_urllib_request.urlopen(request)
4694 video_webpage = page.read()
4695 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4696 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
4699 # Start extracting information
4700 self.report_information_extraction(video_id)
4702 # Extract information
4703 video_info = self._parse_page(video_webpage)
# Uploader and title are mandatory fields; their absence is an error.
4706 if 'owner' not in video_info:
4707 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
4709 video_uploader = video_info['owner']
4712 if 'title' not in video_info:
4713 self._downloader.trouble(u'ERROR: unable to extract video title')
4715 video_title = video_info['title']
4716 video_title = video_title.decode('utf-8')
# Thumbnail is optional: warn and fall back to an empty string.
4719 if 'thumbnail' not in video_info:
4720 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
4721 video_thumbnail = ''
4723 video_thumbnail = video_info['thumbnail']
# Upload date: parse an RFC-2822-style date and reformat to YYYYMMDD.
4727 if 'upload_date' in video_info:
4728 upload_time = video_info['upload_date']
4729 timetuple = email.utils.parsedate_tz(upload_time)
4730 if timetuple is not None:
4732 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
4737 video_description = video_info.get('description', 'No description available.')
4739 url_map = video_info['video_urls']
4741 # Decide which formats to download
4742 req_format = self._downloader.params.get('format', None)
4743 format_limit = self._downloader.params.get('format_limit', None)
# Honour --format-limit by truncating the preference list at the limit.
4745 if format_limit is not None and format_limit in self._available_formats:
4746 format_list = self._available_formats[self._available_formats.index(format_limit):]
4748 format_list = self._available_formats
4749 existing_formats = [x for x in format_list if x in url_map]
4750 if len(existing_formats) == 0:
4751 self._downloader.trouble(u'ERROR: no known formats available for video')
# Map the requested format (None=best, 'worst', '-1'=all, or a specific one).
4753 if req_format is None:
4754 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
4755 elif req_format == 'worst':
4756 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
4757 elif req_format == '-1':
4758 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
4761 if req_format not in url_map:
4762 self._downloader.trouble(u'ERROR: requested format not available')
4764 video_url_list = [(req_format, url_map[req_format])] # Specific format
# Build one info dict per selected format.
4767 for format_param, video_real_url in video_url_list:
4769 video_extension = self._video_extensions.get(format_param, 'mp4')
4772 'id': video_id.decode('utf-8'),
4773 'url': video_real_url.decode('utf-8'),
4774 'uploader': video_uploader.decode('utf-8'),
4775 'upload_date': upload_date,
4776 'title': video_title,
4777 'ext': video_extension.decode('utf-8'),
4778 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
4779 'thumbnail': video_thumbnail.decode('utf-8'),
4780 'description': video_description.decode('utf-8'),
# NOTE(review): elided view — interior lines are missing between the
# numbered lines (try headers, returns, dict braces).
4784 class BlipTVIE(InfoExtractor):
4785 """Information extractor for blip.tv"""
4787 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
4788 _URL_EXT = r'^.*\.([a-z0-9]+)$'
4789 IE_NAME = u'blip.tv'
4791 def report_extraction(self, file_id):
4792 """Report information extraction."""
4793 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
4795 def report_direct_download(self, title):
4796 """Report information extraction."""
4797 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
4799 def _real_extract(self, url):
4800 mobj = re.match(self._VALID_URL, url)
4802 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Request the JSON API representation of the page (skin=json).
4809 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
4810 request = compat_urllib_request.Request(json_url)
4811 self.report_extraction(mobj.group(1))
4814 urlh = compat_urllib_request.urlopen(request)
# A video/* Content-Type means the URL is the media itself: derive
# title/ext from the URL basename instead of parsing JSON.
4815 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
4816 basename = url.split('/')[-1]
4817 title,ext = os.path.splitext(basename)
4818 title = title.decode('UTF-8')
4819 ext = ext.replace('.', '')
4820 self.report_direct_download(title)
4825 'upload_date': None,
4830 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4831 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
# Regular case: parse the JSON payload for the video metadata.
4833 if info is None: # Regular URL
4835 json_code_bytes = urlh.read()
4836 json_code = json_code_bytes.decode('utf-8')
4837 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4838 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
4842 json_data = json.loads(json_code)
4843 if 'Post' in json_data:
4844 data = json_data['Post']
# Reformat blip.tv's '%m-%d-%y %H:%M%p' datestamp to YYYYMMDD.
4848 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
4849 video_url = data['media']['url']
4850 umobj = re.match(self._URL_EXT, video_url)
4852 raise ValueError('Can not determine filename extension')
4853 ext = umobj.group(1)
4856 'id': data['item_id'],
4858 'uploader': data['display_name'],
4859 'upload_date': upload_date,
4860 'title': data['title'],
4862 'format': data['media']['mimeType'],
4863 'thumbnail': data['thumbnailUrl'],
4864 'description': data['description'],
4865 'player_url': data['embedUrl']
4867 except (ValueError,KeyError) as err:
4868 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# blip.tv serves some media only to an iTunes-like User-Agent.
4871 std_headers['User-Agent'] = 'iTunes/10.6.1'
4875 class MyVideoIE(InfoExtractor):
4876 """Information Extractor for myvideo.de."""
4878 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
4879 IE_NAME = u'myvideo'
4881 def __init__(self, downloader=None):
4882 InfoExtractor.__init__(self, downloader)
4884 def report_download_webpage(self, video_id):
4885 """Report webpage download."""
4886 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
4888 def report_extraction(self, video_id):
4889 """Report information extraction."""
4890 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
4892 def _real_extract(self,url):
4893 mobj = re.match(self._VALID_URL, url)
4895 self._download.trouble(u'ERROR: invalid URL: %s' % url)
4898 video_id = mobj.group(1)
4901 request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
4903 self.report_download_webpage(video_id)
4904 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
4905 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4906 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
4909 self.report_extraction(video_id)
4910 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
4913 self._downloader.trouble(u'ERROR: unable to extract media URL')
4915 video_url = mobj.group(1) + ('/%s.flv' % video_id)
4917 mobj = re.search('<title>([^<]+)</title>', webpage)
4919 self._downloader.trouble(u'ERROR: unable to extract title')
4922 video_title = mobj.group(1)
4928 'upload_date': None,
4929 'title': video_title,
# NOTE(review): elided view — interior lines are missing between the
# numbered lines (try headers, returns, else branches, dict literals).
4933 class ComedyCentralIE(InfoExtractor):
4934 """Information extractor for The Daily Show and Colbert Report """
4936 # urls can be abbreviations like :thedailyshow or :colbert
4937 # urls for episodes like:
4938 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
4939 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
4940 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
4941 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
4942 |(https?://)?(www\.)?
4943 (?P<showname>thedailyshow|colbertnation)\.com/
4944 (full-episodes/(?P<episode>.*)|
4946 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
4947 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
4949 IE_NAME = u'comedycentral'
# Bitrates (kbps) in ascending preference order; last entry is best.
4951 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
4953 _video_extensions = {
4961 _video_dimensions = {
# _VALID_URL is a verbose-mode pattern, so suitable() must pass re.VERBOSE.
4970 def suitable(self, url):
4971 """Receives a URL and returns True if suitable for this IE."""
4972 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
4974 def report_extraction(self, episode_id):
4975 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
4977 def report_config_download(self, episode_id):
4978 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
4980 def report_index_download(self, episode_id):
4981 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
4983 def report_player_url(self, episode_id):
4984 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
4987 def _print_formats(self, formats):
4988 print('Available formats:')
4990 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
4993 def _real_extract(self, url):
4994 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
4996 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Expand ':tds' / ':colbert' shortcuts to the full-episodes index URL,
# then re-match so the named groups are populated.
4999 if mobj.group('shortname'):
5000 if mobj.group('shortname') in ('tds', 'thedailyshow'):
5001 url = u'http://www.thedailyshow.com/full-episodes/'
5003 url = u'http://www.colbertnation.com/full-episodes/'
5004 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
5005 assert mobj is not None
5007 if mobj.group('clip'):
5008 if mobj.group('showname') == 'thedailyshow':
5009 epTitle = mobj.group('tdstitle')
5011 epTitle = mobj.group('cntitle')
5014 dlNewest = not mobj.group('episode')
5016 epTitle = mobj.group('showname')
5018 epTitle = mobj.group('episode')
5020 req = compat_urllib_request.Request(url)
5021 self.report_extraction(epTitle)
5023 htmlHandle = compat_urllib_request.urlopen(req)
5024 html = htmlHandle.read()
5025 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5026 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The index may redirect; re-validate the final URL after following it.
5029 url = htmlHandle.geturl()
5030 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
5032 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
5034 if mobj.group('episode') == '':
5035 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
5037 epTitle = mobj.group('episode')
# Locate the mtvnservices media URL embedded in the page.
5039 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
5041 if len(mMovieParams) == 0:
5042 # The Colbert Report embeds the information in a without
5043 # a URL prefix; so extract the alternate reference
5044 # and then add the URL prefix manually.
5046 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
5047 if len(altMovieParams) == 0:
5048 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
5051 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Resolve the player URL by following its redirects.
5053 playerUrl_raw = mMovieParams[0][0]
5054 self.report_player_url(epTitle)
5056 urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
5057 playerUrl = urlHandle.geturl()
5058 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5059 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
# Download the MRSS show index for this media URI.
5062 uri = mMovieParams[0][1]
5063 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
5064 self.report_index_download(epTitle)
5066 indexXml = compat_urllib_request.urlopen(indexUrl).read()
5067 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5068 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
# One <item> per act/segment; each has its own mediaGen config.
5073 idoc = xml.etree.ElementTree.fromstring(indexXml)
5074 itemEls = idoc.findall('.//item')
5075 for itemEl in itemEls:
5076 mediaId = itemEl.findall('./guid')[0].text
5077 shortMediaId = mediaId.split(':')[-1]
5078 showId = mediaId.split(':')[-2].replace('.com', '')
5079 officialTitle = itemEl.findall('./title')[0].text
5080 officialDate = itemEl.findall('./pubDate')[0].text
5082 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
5083 compat_urllib_parse.urlencode({'uri': mediaId}))
5084 configReq = compat_urllib_request.Request(configUrl)
5085 self.report_config_download(epTitle)
5087 configXml = compat_urllib_request.urlopen(configReq).read()
5088 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5089 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp-url) pairs from the config renditions.
5092 cdoc = xml.etree.ElementTree.fromstring(configXml)
5094 for rendition in cdoc.findall('.//rendition'):
5095 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
5099 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
5102 if self._downloader.params.get('listformats', None):
5103 self._print_formats([i[0] for i in turls])
5106 # For now, just pick the highest bitrate
5107 format,video_url = turls[-1]
5109 # Get the format arg from the arg stream
5110 req_format = self._downloader.params.get('format', None)
5112 # Select format if we can find one
5115 format, video_url = f, v
5118 # Patch to download from alternative CDN, which does not
5119 # break on current RTMPDump builds
5120 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
5121 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
5123 if video_url.startswith(broken_cdn):
5124 video_url = video_url.replace(broken_cdn, better_cdn)
5126 effTitle = showId + u'-' + epTitle
5131 'upload_date': officialDate,
5136 'description': officialTitle,
5137 'player_url': None #playerUrl
5140 results.append(info)
# NOTE(review): elided view — interior lines are missing between the
# numbered lines.
5145 class EscapistIE(InfoExtractor):
5146 """Information extractor for The Escapist """
5148 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
5149 IE_NAME = u'escapist'
5151 def report_extraction(self, showName):
5152 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
5154 def report_config_download(self, showName):
5155 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
5157 def _real_extract(self, url):
5158 mobj = re.match(self._VALID_URL, url)
5160 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5162 showName = mobj.group('showname')
5163 videoId = mobj.group('episode')
5165 self.report_extraction(showName)
# Decode the page with the charset from Content-Type, defaulting to UTF-8.
5167 webPage = compat_urllib_request.urlopen(url)
5168 webPageBytes = webPage.read()
5169 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
5170 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
5171 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5172 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# Scrape description, thumbnail and player URL from the page's meta tags;
# the config URL is carried in the player URL's "config=" query parameter.
5175 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
5176 description = unescapeHTML(descMatch.group(1))
5177 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
5178 imgUrl = unescapeHTML(imgMatch.group(1))
5179 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
5180 playerUrl = unescapeHTML(playerUrlMatch.group(1))
5181 configUrlMatch = re.search('config=(.*)$', playerUrl)
5182 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
5184 self.report_config_download(showName)
5186 configJSON = compat_urllib_request.urlopen(configUrl)
5187 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
5188 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
5189 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5190 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
5193 # Technically, it's JavaScript, not JSON
# Single-quote -> double-quote rewrite so json.loads accepts the JS literal.
5194 configJSON = configJSON.replace("'", '"')
5197 config = json.loads(configJSON)
5198 except (ValueError,) as err:
5199 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
# The media URL is the second playlist entry (index 1).
5202 playlist = config['playlist']
5203 videoUrl = playlist[1]['url']
5208 'uploader': showName,
5209 'upload_date': None,
5212 'thumbnail': imgUrl,
5213 'description': description,
5214 'player_url': playerUrl,
# NOTE(review): elided view — interior lines are missing between the
# numbered lines.
5220 class CollegeHumorIE(InfoExtractor):
5221 """Information extractor for collegehumor.com"""
5224 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
5225 IE_NAME = u'collegehumor'
5227 def report_manifest(self, video_id):
5228 """Report information extraction."""
5229 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
5231 def report_extraction(self, video_id):
5232 """Report information extraction."""
5233 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
5235 def _real_extract(self, url):
5236 mobj = re.match(self._VALID_URL, url)
5238 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5240 video_id = mobj.group('videoid')
5245 'upload_date': None,
# First fetch the moogaloop metadata XML for this video id.
5248 self.report_extraction(video_id)
5249 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
5251 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
5252 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5253 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
5256 mdoc = xml.etree.ElementTree.fromstring(metaXml)
5258 videoNode = mdoc.findall('./video')[0]
5259 info['description'] = videoNode.findall('./description')[0].text
5260 info['title'] = videoNode.findall('./caption')[0].text
5261 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
5262 manifest_url = videoNode.findall('./file')[0].text
5264 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Then fetch the Adobe HDS (f4m) manifest referenced by the metadata.
5267 manifest_url += '?hdcore=2.10.3'
5268 self.report_manifest(video_id)
5270 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
5271 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5272 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# The manifest is in the f4m namespace; media/@url and id build the
# final Seg1-Frag1 fragment URL.
5275 adoc = xml.etree.ElementTree.fromstring(manifestXml)
5277 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
5278 node_id = media_node.attrib['url']
5279 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
5280 except IndexError as err:
5281 self._downloader.trouble(u'\nERROR: Invalid manifest file')
5284 url_pr = compat_urllib_parse_urlparse(manifest_url)
5285 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): elided view — interior lines are missing between the
# numbered lines.
5292 class XVideosIE(InfoExtractor):
5293 """Information extractor for xvideos.com"""
5295 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
5296 IE_NAME = u'xvideos'
5298 def report_webpage(self, video_id):
5299 """Report information extraction."""
5300 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
5302 def report_extraction(self, video_id):
5303 """Report information extraction."""
5304 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
5306 def _real_extract(self, url):
5307 mobj = re.match(self._VALID_URL, url)
5309 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5311 video_id = mobj.group(1)
5313 self.report_webpage(video_id)
5315 request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
# 'replace' avoids a crash on any non-UTF-8 bytes in the page.
5317 webpage_bytes = compat_urllib_request.urlopen(request).read()
5318 webpage = webpage_bytes.decode('utf-8', 'replace')
5319 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5320 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
5323 self.report_extraction(video_id)
# The media URL is percent-encoded in the flv_url page parameter.
5327 mobj = re.search(r'flv_url=(.+?)&', webpage)
5329 self._downloader.trouble(u'ERROR: unable to extract video url')
5331 video_url = compat_urllib_parse.unquote(mobj.group(1))
5335 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
5337 self._downloader.trouble(u'ERROR: unable to extract video title')
5339 video_title = mobj.group(1)
5342 # Extract video thumbnail
5343 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
5345 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0) = the whole matched thumbnail URL, not just the filename group.
5347 video_thumbnail = mobj.group(0)
5353 'upload_date': None,
5354 'title': video_title,
5356 'thumbnail': video_thumbnail,
5357 'description': None,
# NOTE(review): elided view — interior lines are missing between the
# numbered lines.
5363 class SoundcloudIE(InfoExtractor):
5364 """Information extractor for soundcloud.com
5365 To access the media, the uid of the song and a stream token
5366 must be extracted from the page source and the script must make
5367 a request to media.soundcloud.com/crossdomain.xml. Then
5368 the media can be grabbed by requesting from an url composed
5369 of the stream token and uid
5372 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
5373 IE_NAME = u'soundcloud'
5375 def __init__(self, downloader=None):
5376 InfoExtractor.__init__(self, downloader)
5378 def report_resolve(self, video_id):
5379 """Report information extraction."""
5380 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
5382 def report_extraction(self, video_id):
5383 """Report information extraction."""
5384 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
5386 def _real_extract(self, url):
5387 mobj = re.match(self._VALID_URL, url)
5389 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5392 # extract uploader (which is in the url)
5393 uploader = mobj.group(1)
5394 # extract simple title (uploader + slug of song title)
5395 slug_title = mobj.group(2)
5396 simple_title = uploader + u'-' + slug_title
# Resolve the track page to its JSON metadata via the API resolver;
# the client_id is hard-coded in the query string.
5398 self.report_resolve('%s/%s' % (uploader, slug_title))
5400 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
5401 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
5402 request = compat_urllib_request.Request(resolv_url)
5404 info_json_bytes = compat_urllib_request.urlopen(request).read()
5405 info_json = info_json_bytes.decode('utf-8')
5406 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5407 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# Second request: the streams endpoint yields the actual media URLs.
5410 info = json.loads(info_json)
5411 video_id = info['id']
5412 self.report_extraction('%s/%s' % (uploader, slug_title))
5414 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
5415 request = compat_urllib_request.Request(streams_url)
5417 stream_json_bytes = compat_urllib_request.urlopen(request).read()
5418 stream_json = stream_json_bytes.decode('utf-8')
5419 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5420 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# Always pick the 128kbps MP3 HTTP stream.
5423 streams = json.loads(stream_json)
5424 mediaURL = streams['http_mp3_128_url']
5429 'uploader': info['user']['username'],
5430 'upload_date': info['created_at'],
5431 'title': info['title'],
5433 'description': info['description'],
# NOTE(review): elided view — interior lines are missing between the
# numbered lines.
5437 class InfoQIE(InfoExtractor):
5438 """Information extractor for infoq.com"""
5440 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
5443 def report_webpage(self, video_id):
5444 """Report information extraction."""
5445 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
5447 def report_extraction(self, video_id):
5448 """Report information extraction."""
5449 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
5451 def _real_extract(self, url):
5452 mobj = re.match(self._VALID_URL, url)
5454 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5457 self.report_webpage(url)
5459 request = compat_urllib_request.Request(url)
5461 webpage = compat_urllib_request.urlopen(request).read()
5462 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5463 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
5466 self.report_extraction(url)
# jsclassref holds a base64-encoded path appended to the RTMP base URL.
# NOTE(review): .decode('base64') is a Python 2-only bytes codec.
5470 mobj = re.search(r"jsclassref='([^']*)'", webpage)
5472 self._downloader.trouble(u'ERROR: unable to extract video url')
5474 video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))
5478 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
5480 self._downloader.trouble(u'ERROR: unable to extract video title')
5482 video_title = mobj.group(1).decode('utf-8')
5484 # Extract description
# Description is optional; a placeholder is used when absent.
5485 video_description = u'No description available.'
5486 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
5487 if mobj is not None:
5488 video_description = mobj.group(1).decode('utf-8')
# The video id and extension come from the last path segment of the URL.
5490 video_filename = video_url.split('/')[-1]
5491 video_id, extension = video_filename.split('.')
5497 'upload_date': None,
5498 'title': video_title,
5499 'ext': extension, # Extension is always(?) mp4, but seems to be flv
5501 'description': video_description,
# NOTE(review): elided view — interior lines are missing between the
# numbered lines.
5506 class MixcloudIE(InfoExtractor):
5507 """Information extractor for www.mixcloud.com"""
# Marked broken pending a port to the new Mixcloud API.
5509 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
5510 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
5511 IE_NAME = u'mixcloud'
5513 def __init__(self, downloader=None):
5514 InfoExtractor.__init__(self, downloader)
5516 def report_download_json(self, file_id):
5517 """Report JSON download."""
5518 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
5520 def report_extraction(self, file_id):
5521 """Report information extraction."""
5522 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
5524 def get_urls(self, jsonData, fmt, bitrate='best'):
5525 """Get urls from 'audio_formats' section in json"""
# 'best' (or an unknown bitrate) falls back to the highest available one.
5528 bitrate_list = jsonData[fmt]
5529 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
5530 bitrate = max(bitrate_list) # select highest
5532 url_list = jsonData[fmt][bitrate]
# Some formats carry a flat URL list with no per-bitrate mapping.
5533 except TypeError: # we have no bitrate info.
5534 url_list = jsonData[fmt]
5537 def check_urls(self, url_list):
5538 """Returns 1st active url from list"""
# Probe each candidate with an HTTP request; network failures mean
# "try the next one".
5539 for url in url_list:
5541 compat_urllib_request.urlopen(url)
5543 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5548 def _print_formats(self, formats):
5549 print('Available formats:')
5550 for fmt in formats.keys():
5551 for b in formats[fmt]:
5553 ext = formats[fmt][b][0]
5554 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
5555 except TypeError: # we have no bitrate info
5556 ext = formats[fmt][0]
5557 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
5560 def _real_extract(self, url):
5561 mobj = re.match(self._VALID_URL, url)
5563 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5565 # extract uploader & filename from url
5566 uploader = mobj.group(1).decode('utf-8')
5567 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
5569 # construct API request
5570 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
5571 # retrieve .json file with links to files
5572 request = compat_urllib_request.Request(file_url)
5574 self.report_download_json(file_url)
5575 jsonData = compat_urllib_request.urlopen(request).read()
5576 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5577 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
# Parse the cloudcast JSON: player SWF plus per-format audio URL lists.
5581 json_data = json.loads(jsonData)
5582 player_url = json_data['player_swf_url']
5583 formats = dict(json_data['audio_formats'])
5585 req_format = self._downloader.params.get('format', None)
5588 if self._downloader.params.get('listformats', None):
5589 self._print_formats(formats)
# 'best'/default: walk formats until one yields a live URL; otherwise
# use the explicitly requested format.
5592 if req_format is None or req_format == 'best':
5593 for format_param in formats.keys():
5594 url_list = self.get_urls(formats, format_param)
5596 file_url = self.check_urls(url_list)
5597 if file_url is not None:
5600 if req_format not in formats:
5601 self._downloader.trouble(u'ERROR: format is not available')
5604 url_list = self.get_urls(formats, req_format)
5605 file_url = self.check_urls(url_list)
5606 format_param = req_format
5609 'id': file_id.decode('utf-8'),
5610 'url': file_url.decode('utf-8'),
5611 'uploader': uploader.decode('utf-8'),
5612 'upload_date': None,
5613 'title': json_data['name'],
5614 'ext': file_url.split('.')[-1].decode('utf-8'),
5615 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
5616 'thumbnail': json_data['thumbnail_url'],
5617 'description': json_data['description'],
5618 'player_url': player_url.decode('utf-8'),
# NOTE(review): elided view — interior lines are missing between the
# numbered lines.
5621 class StanfordOpenClassroomIE(InfoExtractor):
5622 """Information extractor for Stanford's Open ClassRoom"""
5624 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
5625 IE_NAME = u'stanfordoc'
5627 def report_download_webpage(self, objid):
5628 """Report information extraction."""
5629 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
5631 def report_extraction(self, video_id):
5632 """Report information extraction."""
5633 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# _real_extract handles three URL shapes: a specific video, a course
# page (list of videos), and the site root (list of courses). The two
# list cases recurse via self.extract() on 'reference' entries.
5635 def _real_extract(self, url):
5636 mobj = re.match(self._VALID_URL, url)
5638 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5641 if mobj.group('course') and mobj.group('video'): # A specific video
5642 course = mobj.group('course')
5643 video = mobj.group('video')
5645 'id': course + '_' + video,
5647 'upload_date': None,
5650 self.report_extraction(info['id'])
5651 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
5652 xmlUrl = baseUrl + video + '.xml'
5654 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
5655 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5656 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
5658 mdoc = xml.etree.ElementTree.fromstring(metaXml)
5660 info['title'] = mdoc.findall('./title')[0].text
5661 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
5663 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
5665 info['ext'] = info['url'].rpartition('.')[2]
5667 elif mobj.group('course'): # A course page
5668 course = mobj.group('course')
5673 'upload_date': None,
5676 self.report_download_webpage(info['id'])
5678 coursepage = compat_urllib_request.urlopen(url).read()
5679 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5680 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
# Title falls back to the id when no <h1> is present.
5683 m = re.search('<h1>([^<]+)</h1>', coursepage)
5685 info['title'] = unescapeHTML(m.group(1))
5687 info['title'] = info['id']
5689 m = re.search('<description>([^<]+)</description>', coursepage)
5691 info['description'] = unescapeHTML(m.group(1))
# Collect VideoPage links (order-preserving dedupe) and recurse.
5693 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
5696 'type': 'reference',
5697 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
5701 for entry in info['list']:
5702 assert entry['type'] == 'reference'
5703 results += self.extract(entry['url'])
5708 'id': 'Stanford OpenClassroom',
5711 'upload_date': None,
# Root case: enumerate every CoursePage link from the home page.
5714 self.report_download_webpage(info['id'])
5715 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
5717 rootpage = compat_urllib_request.urlopen(rootURL).read()
5718 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5719 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
5722 info['title'] = info['id']
5724 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
5727 'type': 'reference',
5728 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
5733 for entry in info['list']:
5734 assert entry['type'] == 'reference'
5735 results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Downloads the video page, scrapes the mtv_vt (song name), mtv_an
    (performer) and mtvn_uri meta tags plus the default playlist id, then
    queries the mediaGen service for the rendition list and picks the
    last (highest-quality) rendition.

    NOTE(review): the original region was a line-number-mangled paste with
    lines sampled out; the guard/`return` lines restored below follow the
    uniform trouble-and-return pattern of the sibling extractors — confirm
    against upstream history.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            # Scheme is optional in _VALID_URL; normalize so urlopen works.
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        # Page is served as latin-1; unescape after decoding.
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))

        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # BUGFIX: message used to read "unable to mtvn_uri" (missing verb).
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # type is e.g. "video/mp4" -> ext "mp4"; format string encodes
            # container, resolution and bitrate for --get-format.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
# --- YoukuIE: information extractor for v.youku.com ---
# NOTE(review): this region is a mangled paste -- each line carries a fused
# original line number and the embedded numbering skips (e.g. 5830, 5832-5833
# absent), so some statements are missing from view.  Comments describe only
# what the visible lines show.
5829 class YoukuIE(InfoExtractor):
5831 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Constructor simply delegates to the InfoExtractor base.
5834 def __init__(self, downloader=None):
5835 InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers: status lines go through the downloader.
5837 def report_download_webpage(self, file_id):
5838 """Report webpage download."""
5839 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
5841 def report_extraction(self, file_id):
5842 """Report information extraction."""
5843 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# Session id: millisecond timestamp concatenated with two random ints.
# (The enclosing `def` line -- presumably _gen_sid, called at 5922 -- is
# among the missing lines.)
5846 nowTime = int(time.time() * 1000)
5847 random1 = random.randint(1000,1998)
5848 random2 = random.randint(1000,9999)
5850 return "%d%d%d" %(nowTime,random1,random2)
# Builds a deterministic shuffle of the character table below, driven by a
# 16-bit linear-congruential sequence seeded with `seed` -- used to decode
# Youku's obfuscated file ids.  (`mixed = []` init line missing from view.)
5852 def _get_file_ID_mix_string(self, seed):
5854 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
5856 for i in range(len(source)):
5857 seed = (seed * 211 + 30031 ) % 65536
5858 index = math.floor(seed / 65536 * len(source) )
# Pick and then remove the selected character so each is used exactly once.
5859 mixed.append(source[int(index)])
5860 source.remove(source[int(index)])
5861 #return ''.join(mixed)
# Decode the '*'-separated fileId: each numeric token indexes into the
# shuffled table.  (`realId = []` init and the loop over tokens are partly
# missing from view.)
5864 def _get_file_id(self, fileId, seed):
5865 mixed = self._get_file_ID_mix_string(seed)
5866 ids = fileId.split('*')
5870 realId.append(mixed[int(ch)])
5871 return ''.join(realId)
# Main entry point: fetch the getPlayList JSON for the video id, pick a
# format, then build one info dict per video segment.
5873 def _real_extract(self, url):
5874 mobj = re.match(self._VALID_URL, url)
5876 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5878 video_id = mobj.group('ID')
5880 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
5882 request = compat_urllib_request.Request(info_url, None, std_headers)
5884 self.report_download_webpage(video_id)
5885 jsondata = compat_urllib_request.urlopen(request).read()
5886 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5887 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
5890 self.report_extraction(video_id)
5892 jsonstr = jsondata.decode('utf-8')
5893 config = json.loads(jsonstr)
5895 video_title = config['data'][0]['title']
5896 seed = config['data'][0]['seed']
# Format selection: honour --format; 'best'/'worst' are mapped onto the
# formats advertised in streamfileids (branch bodies largely missing from
# view, lines 5903-5914).
5898 format = self._downloader.params.get('format', None)
5899 supported_format = list(config['data'][0]['streamfileids'].keys())
5901 if format is None or format == 'best':
5902 if 'hd2' in supported_format:
5907 elif format == 'worst':
5915 fileid = config['data'][0]['streamfileids'][format]
5916 keys = [s['k'] for s in config['data'][0]['segs'][format]]
5917 except (UnicodeDecodeError, ValueError, KeyError):
5918 self._downloader.trouble(u'ERROR: unable to extract info section')
5922 sid = self._gen_sid()
5923 fileid = self._get_file_id(fileid, seed)
5925 #column 8,9 of fileid represent the segment number
5926 #fileid[7:9] should be changed
# One download URL (and one info dict) per segment: the zero-based segment
# index is spliced into the fileid as two uppercase hex digits.
5927 for index, key in enumerate(keys):
5929 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
5930 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
5933 'id': '%s_part%02d' % (video_id, index),
5934 'url': download_url,
5936 'upload_date': None,
5937 'title': video_title,
5940 files_info.append(info)
# --- XNXXIE: information extractor for video.xnxx.com ---
# NOTE(review): mangled paste with fused line numbers and gaps; the
# `if ... is None:` guards that precede each trouble() call here are among
# the missing lines.
5945 class XNXXIE(InfoExtractor):
5946 """Information extractor for xnxx.com"""
5948 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# Regexes for the flv url, the page <title> and the thumbnail url that are
# embedded in the video page.
5950 VIDEO_URL_RE = r'flv_url=(.*?)&'
5951 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
5952 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
5954 def report_webpage(self, video_id):
5955 """Report webpage download."""
5956 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
5958 def report_extraction(self, video_id):
5959 """Report information extraction"""
5960 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Entry point: fetch the page and scrape url/title/thumbnail with the
# regexes above.
5962 def _real_extract(self, url):
5963 mobj = re.match(self._VALID_URL, url)
5965 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5967 video_id = mobj.group(1)
5969 self.report_webpage(video_id)
5971 # Get webpage content
5973 webpage_bytes = compat_urllib_request.urlopen(url).read()
5974 webpage = webpage_bytes.decode('utf-8')
5975 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): sibling extractors interpolate compat_str(err) here; this
# one uses bare `err` -- consider aligning for consistent py2/py3 output.
5976 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
5979 result = re.search(self.VIDEO_URL_RE, webpage)
5981 self._downloader.trouble(u'ERROR: unable to extract video url')
# flv_url is percent-encoded inside the page; decode it.
5983 video_url = compat_urllib_parse.unquote(result.group(1))
5985 result = re.search(self.VIDEO_TITLE_RE, webpage)
5987 self._downloader.trouble(u'ERROR: unable to extract video title')
5989 video_title = result.group(1)
5991 result = re.search(self.VIDEO_THUMB_RE, webpage)
5993 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
5995 video_thumbnail = result.group(1)
# Returned info dict (surrounding braces/keys partly missing from view).
6001 'upload_date': None,
6002 'title': video_title,
6004 'thumbnail': video_thumbnail,
6005 'description': None,
# --- GooglePlusIE: information extractor for plus.google.com posts ---
# NOTE(review): mangled paste with fused line numbers and gaps; `try:` lines
# and the `if mobj is None:` guards are among the missing lines.
6009 class GooglePlusIE(InfoExtractor):
6010 """Information extractor for plus.google.com."""
6012 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
6013 IE_NAME = u'plus.google'
# Constructor simply delegates to the InfoExtractor base.
6015 def __init__(self, downloader=None):
6016 InfoExtractor.__init__(self, downloader)
6018 def report_extract_entry(self, url):
6019 """Report that the post entry is being downloaded."""
6020 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
6022 def report_date(self, upload_date):
6023 """Report the entry's upload date."""
6024 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
6026 def report_uploader(self, uploader):
6027 """Report the entry's uploader."""
6028 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
6030 def report_title(self, video_title):
6031 """Report the extracted title."""
6032 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
6034 def report_extract_vid_page(self, video_page):
6035 """Report information extraction from the video page."""
6036 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
6038 def _real_extract(self, url):
6039 # Extract id from URL
6040 mobj = re.match(self._VALID_URL, url)
6042 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
6045 post_url = mobj.group(0)
6046 video_id = mobj.group(1)
6048 video_extension = 'flv'
6050 # Step 1, Retrieve post webpage to extract further information
6051 self.report_extract_entry(post_url)
6052 request = compat_urllib_request.Request(post_url)
6054 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
6055 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
6056 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
6059 # Extract update date
6061 pattern = 'title="Timestamp">(.*?)</a>'
6062 mobj = re.search(pattern, webpage)
6064 upload_date = mobj.group(1)
6065 # Convert timestring to a format suitable for filename
6066 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
6067 upload_date = upload_date.strftime('%Y%m%d')
6068 self.report_date(upload_date)
# Extract the uploader name from the rel="author" anchor.
6072 pattern = r'rel\="author".*?>(.*?)</a>'
6073 mobj = re.search(pattern, webpage)
6075 uploader = mobj.group(1)
6076 self.report_uploader(uploader)
6079 # Get the first line for title
6081 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
6082 mobj = re.search(pattern, webpage)
6084 video_title = mobj.group(1)
6085 self.report_title(video_title)
6087 # Step 2, Stimulate clicking the image box to launch video
6088 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
6089 mobj = re.search(pattern, webpage)
6091 self._downloader.trouble(u'ERROR: unable to extract video page URL')
6093 video_page = mobj.group(1)
6094 request = compat_urllib_request.Request(video_page)
6096 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
6097 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
6098 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
6100 self.report_extract_vid_page(video_page)
6103 # Extract video links on video page
6104 """Extract video links of all sizes"""
6105 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
6106 mobj = re.findall(pattern, webpage)
6108 self._downloader.trouble(u'ERROR: unable to extract video links')
6110 # Sort in resolution
6111 links = sorted(mobj)
6113 # Choose the lowest of the sort, i.e. highest resolution
6114 video_url = links[-1]
6115 # Only get the url. The resolution part in the tuple has no use anymore
6116 video_url = video_url[-1]
6117 # Treat escaped \u0026 style hex
6119 video_url = video_url.decode("unicode_escape")
# On Python 3 str has no .decode; round-trip through bytes instead.
6120 except AttributeError: # Python 3
6121 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
# Returned info dict (surrounding braces/keys partly missing from view).
6127 'uploader': uploader,
6128 'upload_date': upload_date,
6129 'title': video_title,
6130 'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages.

    The downloadable MP4 URL is derived directly from the page path on
    Turner's CDN; title, date and description are scraped from the page
    HTML.

    NOTE(review): the original region was a line-number-mangled paste with
    lines sampled out; guard/`return` lines restored below follow the
    uniform trouble-and-return pattern of the sibling extractors.
    """

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            # Normalize directory-style URLs to the bare video path.
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The CDN URL is a pure function of the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Scrape a single regex group from the page, HTML-unescaped;
            # fall back to `default` when the pattern is absent.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: key was misspelled 'uploader_date'; the downloader
            # contract (see the InfoExtractor docstring) reads 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
# --- JustinTVIE: information extractor for justin.tv / twitch.tv ---
# NOTE(review): mangled paste with fused line numbers and gaps; `try:`
# lines, guards and the paging-state initialisation (e.g. `info = []`,
# `offset`, `paged`) are among the missing lines.
6179 class JustinTVIE(InfoExtractor):
6180 """Information extractor for justin.tv and twitch.tv"""
6181 # TODO: One broadcast may be split into multiple videos. The key
6182 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
6183 # starts at 1 and increases. Can we treat all parts as one video?
6185 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
6186 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
# The API returns at most this many clips per request; used for paging.
6187 _JUSTIN_PAGE_LIMIT = 100
6188 IE_NAME = u'justin.tv'
6190 def report_extraction(self, file_id):
6191 """Report information extraction."""
6192 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
6194 def report_download_page(self, channel, offset):
6195 """Report attempt to download a single page of videos."""
6196 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
6197 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
6199 # Return count of items, list of *valid* items
6200 def _parse_page(self, url):
6202 urlh = compat_urllib_request.urlopen(url)
6203 webpage_bytes = urlh.read()
6204 webpage = webpage_bytes.decode('utf-8', 'ignore')
6205 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
6206 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
# Build one info dict per clip from the API's JSON list.
6209 response = json.loads(webpage)
6211 for clip in response:
6212 video_url = clip['video_file_url']
6214 video_extension = os.path.splitext(video_url)[1][1:]
# created_on starts with YYYY-MM-DD; strip the dashes to get YYYYMMDD.
6215 video_date = re.sub('-', '', clip['created_on'][:10])
6219 'title': clip['title'],
6220 'uploader': clip.get('user_id', clip.get('channel_id')),
6221 'upload_date': video_date,
6222 'ext': video_extension,
6224 return (len(response), info)
6226 def _real_extract(self, url):
6227 mobj = re.match(self._VALID_URL, url)
6229 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 1 is a channel name, group 2 (if present) a broadcast id; pick
# whichever matched last and choose the API endpoint accordingly.
6232 api = 'http://api.justin.tv'
6233 video_id = mobj.group(mobj.lastindex)
6235 if mobj.lastindex == 1:
6237 api += '/channel/archives/%s.json'
6239 api += '/clip/show/%s.json'
6240 api = api % (video_id,)
6242 self.report_extraction(video_id)
# Page through the API in _JUSTIN_PAGE_LIMIT-sized chunks until a short
# page signals the end (loop header and state init missing from view).
6246 limit = self._JUSTIN_PAGE_LIMIT
6249 self.report_download_page(video_id, offset)
6250 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
6251 page_count, page_info = self._parse_page(page_url)
6252 info.extend(page_info)
6253 if not paged or page_count != limit:
# --- FunnyOrDieIE: information extractor for funnyordie.com ---
# NOTE(review): mangled paste with fused line numbers and gaps; `try:`
# lines, the `if m is None:` guards and the final info-dict/return are
# among the missing lines.
6258 class FunnyOrDieIE(InfoExtractor):
6259 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
6260 IE_NAME = u'FunnyOrDie'
6262 def report_extraction(self, video_id):
# Report information extraction progress via the downloader.
6263 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
6265 def _real_extract(self, url):
6266 mobj = re.match(self._VALID_URL, url)
6268 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
6271 video_id = mobj.group('id')
6272 self.report_extraction(video_id)
6274 urlh = compat_urllib_request.urlopen(url)
6275 webpage_bytes = urlh.read()
6276 webpage = webpage_bytes.decode('utf-8', 'ignore')
6277 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
6278 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The direct media URL is the second <source> inside the <video> tag.
6281 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
6283 self._downloader.trouble(u'ERROR: unable to find video information')
6284 video_url = unescapeHTML(m.group('url'))
6286 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
6288 self._downloader.trouble(u'Cannot find video title')
6289 title = unescapeHTML(m.group('title'))
# Description comes from the og:description meta tag.
6291 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
6293 desc = unescapeHTML(m.group('desc'))
# Returned info dict (surrounding braces/keys missing from view).
6302 'description': desc,
6306 class TweetReelIE(InfoExtractor):
6307 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
6309 def report_extraction(self, video_id):
6310 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
6312 def _real_extract(self, url):
6313 mobj = re.match(self._VALID_URL, url)
6315 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
6318 video_id = mobj.group('id')
6319 self.report_extraction(video_id)
6321 urlh = compat_urllib_request.urlopen(url)
6322 webpage_bytes = urlh.read()
6323 webpage = webpage_bytes.decode('utf-8', 'ignore')
6324 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
6325 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
6328 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
6330 self._downloader.trouble(u'ERROR: Cannot find status ID')
6331 status_id = m.group(1)
6333 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
6335 self._downloader.trouble(u'WARNING: Cannot find description')
6336 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
6338 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
6340 self._downloader.trouble(u'ERROR: Cannot find uploader')
6341 uploader = unescapeHTML(m.group('uploader'))
6342 uploader_id = unescapeHTML(m.group('uploader_id'))
6344 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
6346 self._downloader.trouble(u'ERROR: Cannot find upload date')
6347 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
6350 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
6357 'description': desc,
6358 'uploader': uploader,
6359 'uploader_id': uploader_id,
6360 'internal_id': status_id,
6361 'upload_date': upload_date