2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
13 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False          # True once _real_initialize() has run
    _downloader = None      # FileDownloader instance (set via set_downloader)
    _WORKING = True         # set to False in subclasses that are known-broken

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Only run the (possibly expensive) real initialization once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: class name minus the trailing "IE" suffix.
        return type(self).__name__[:-2]
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): reconstructed from a garbled paste. Group layout matters:
    # group 1 is the optional URL prefix, group 2 is the video ID
    # (_extract_id reads mobj.group(2)); the pattern requires re.VERBOSE.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything not listed defaults to 'flv'.
    # NOTE(review): dict bodies reconstructed from upstream youtube-dl — verify.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" display string, used by _print_formats / --list-formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base implementation because _VALID_URL needs re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None
    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
209 def _closed_captions_xml_to_srt(self, xml_string):
211 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
212 # TODO parse xml instead of regex
213 for n, (start, dur_tag, dur, caption) in enumerate(texts):
214 if not dur: dur = '4'
216 end = start + float(dur)
217 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
218 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
219 caption = unescapeHTML(caption)
220 caption = unescapeHTML(caption) # double cycle, intentional
221 srt += str(n+1) + '\n'
222 srt += start + ' --> ' + end + '\n'
223 srt += caption + '\n\n'
226 def _extract_subtitles(self, video_id):
227 self.report_video_subtitles_download(video_id)
228 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
230 srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
231 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
232 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
233 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
234 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
235 if not srt_lang_list:
236 return (u'WARNING: video has no closed captions', None)
237 if self._downloader.params.get('subtitleslang', False):
238 srt_lang = self._downloader.params.get('subtitleslang')
239 elif 'en' in srt_lang_list:
242 srt_lang = list(srt_lang_list.keys())[0]
243 if not srt_lang in srt_lang_list:
244 return (u'WARNING: no closed captions found in the specified language', None)
245 request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
247 srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
248 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
249 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
251 return (u'WARNING: unable to download video subtitles', None)
252 return (None, self._closed_captions_xml_to_srt(srt_xml))
254 def _print_formats(self, formats):
255 print('Available formats:')
257 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
259 def _real_initialize(self):
260 if self._downloader is None:
265 downloader_params = self._downloader.params
267 # Attempt to use provided username and password or .netrc data
268 if downloader_params.get('username', None) is not None:
269 username = downloader_params['username']
270 password = downloader_params['password']
271 elif downloader_params.get('usenetrc', False):
273 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
278 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
279 except (IOError, netrc.NetrcParseError) as err:
280 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
284 request = compat_urllib_request.Request(self._LANG_URL)
287 compat_urllib_request.urlopen(request).read()
288 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
289 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
292 # No authentication to be performed
298 'current_form': 'loginForm',
300 'action_login': 'Log In',
301 'username': username,
302 'password': password,
304 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
307 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
308 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
309 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
311 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
312 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
318 'action_confirm': 'Confirm',
320 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
322 self.report_age_confirmation()
323 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
324 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
325 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
328 def _extract_id(self, url):
329 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
331 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
333 video_id = mobj.group(2)
336 def _real_extract(self, url):
337 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
338 mobj = re.search(self._NEXT_URL_RE, url)
340 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
341 video_id = self._extract_id(url)
344 self.report_video_webpage_download(video_id)
345 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
346 request = compat_urllib_request.Request(url)
348 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
349 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
350 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
353 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
355 # Attempt to extract SWF player URL
356 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
358 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
363 self.report_video_info_webpage_download(video_id)
364 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
365 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
366 % (video_id, el_type))
367 request = compat_urllib_request.Request(video_info_url)
369 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
370 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
371 video_info = compat_parse_qs(video_info_webpage)
372 if 'token' in video_info:
374 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
375 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
377 if 'token' not in video_info:
378 if 'reason' in video_info:
379 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
381 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
384 # Check for "rental" videos
385 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
386 self._downloader.trouble(u'ERROR: "rental" videos not supported')
389 # Start extracting information
390 self.report_information_extraction(video_id)
393 if 'author' not in video_info:
394 self._downloader.trouble(u'ERROR: unable to extract uploader name')
396 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
399 video_uploader_id = None
400 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
402 video_uploader_id = mobj.group(1)
404 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
407 if 'title' not in video_info:
408 self._downloader.trouble(u'ERROR: unable to extract video title')
410 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
413 if 'thumbnail_url' not in video_info:
414 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
416 else: # don't panic if we can't find it
417 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
421 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
423 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
424 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
425 for expression in format_expressions:
427 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
432 video_description = get_element_by_id("eow-description", video_webpage)
433 if video_description:
434 video_description = clean_html(video_description)
436 video_description = ''
439 video_subtitles = None
440 if self._downloader.params.get('writesubtitles', False):
441 (srt_error, video_subtitles) = self._extract_subtitles(video_id)
443 self._downloader.trouble(srt_error)
445 if 'length_seconds' not in video_info:
446 self._downloader.trouble(u'WARNING: unable to extract video duration')
449 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
452 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
454 # Decide which formats to download
455 req_format = self._downloader.params.get('format', None)
457 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
458 self.report_rtmp_download()
459 video_url_list = [(None, video_info['conn'][0])]
460 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
461 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
462 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
463 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
464 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
466 format_limit = self._downloader.params.get('format_limit', None)
467 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
468 if format_limit is not None and format_limit in available_formats:
469 format_list = available_formats[available_formats.index(format_limit):]
471 format_list = available_formats
472 existing_formats = [x for x in format_list if x in url_map]
473 if len(existing_formats) == 0:
474 self._downloader.trouble(u'ERROR: no known formats available for video')
476 if self._downloader.params.get('listformats', None):
477 self._print_formats(existing_formats)
479 if req_format is None or req_format == 'best':
480 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
481 elif req_format == 'worst':
482 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
483 elif req_format in ('-1', 'all'):
484 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
486 # Specific formats. We pick the first in a slash-delimeted sequence.
487 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
488 req_formats = req_format.split('/')
489 video_url_list = None
490 for rf in req_formats:
492 video_url_list = [(rf, url_map[rf])]
494 if video_url_list is None:
495 self._downloader.trouble(u'ERROR: requested format not available')
498 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
502 for format_param, video_real_url in video_url_list:
504 video_extension = self._video_extensions.get(format_param, 'flv')
506 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
507 self._video_dimensions.get(format_param, '???'))
511 'url': video_real_url,
512 'uploader': video_uploader,
513 'uploader_id': video_uploader_id,
514 'upload_date': upload_date,
515 'title': video_title,
516 'ext': video_extension,
517 'format': video_format,
518 'thumbnail': video_thumbnail,
519 'description': video_description,
520 'player_url': player_url,
521 'subtitles': video_subtitles,
522 'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
554 def _real_initialize(self):
555 # Retrieve disclaimer
556 request = compat_urllib_request.Request(self._DISCLAIMER)
558 self.report_disclaimer()
559 disclaimer = compat_urllib_request.urlopen(request).read()
560 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
561 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
567 'submit': "Continue - I'm over 18",
569 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
571 self.report_age_confirmation()
572 disclaimer = compat_urllib_request.urlopen(request).read()
573 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
574 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
577 def _real_extract(self, url):
578 # Extract id and simplified title from URL
579 mobj = re.match(self._VALID_URL, url)
581 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
584 video_id = mobj.group(1)
586 # Check if video comes from YouTube
587 mobj2 = re.match(r'^yt-(.*)$', video_id)
588 if mobj2 is not None:
589 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
592 # Retrieve video webpage to extract further information
593 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
595 self.report_download_webpage(video_id)
596 webpage = compat_urllib_request.urlopen(request).read()
597 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
598 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
601 # Extract URL, uploader and title from webpage
602 self.report_extraction(video_id)
603 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
605 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
606 video_extension = mediaURL[-3:]
608 # Extract gdaKey if available
609 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
613 gdaKey = mobj.group(1)
614 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
616 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
618 self._downloader.trouble(u'ERROR: unable to extract media URL')
620 vardict = compat_parse_qs(mobj.group(1))
621 if 'mediaData' not in vardict:
622 self._downloader.trouble(u'ERROR: unable to extract media URL')
624 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
626 self._downloader.trouble(u'ERROR: unable to extract media URL')
628 mediaURL = mobj.group(1).replace('\\/', '/')
629 video_extension = mediaURL[-3:]
630 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
632 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
634 self._downloader.trouble(u'ERROR: unable to extract title')
636 video_title = mobj.group(1).decode('utf-8')
638 mobj = re.search(r'submitter=(.*?);', webpage)
640 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
642 video_uploader = mobj.group(1)
645 'id': video_id.decode('utf-8'),
646 'url': video_url.decode('utf-8'),
647 'uploader': video_uploader.decode('utf-8'),
649 'title': video_title,
650 'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
671 def _real_extract(self, url):
672 # Extract id and simplified title from URL
673 mobj = re.match(self._VALID_URL, url)
675 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
678 video_id = mobj.group(1).split('_')[0].split('?')[0]
680 video_extension = 'mp4'
682 # Retrieve video webpage to extract further information
683 request = compat_urllib_request.Request(url)
684 request.add_header('Cookie', 'family_filter=off')
686 self.report_download_webpage(video_id)
687 webpage_bytes = compat_urllib_request.urlopen(request).read()
688 webpage = webpage_bytes.decode('utf-8')
689 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
690 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
693 # Extract URL, uploader and title from webpage
694 self.report_extraction(video_id)
695 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
697 self._downloader.trouble(u'ERROR: unable to extract media URL')
699 flashvars = compat_urllib_parse.unquote(mobj.group(1))
701 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
704 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
707 self._downloader.trouble(u'ERROR: unable to extract video URL')
710 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
712 self._downloader.trouble(u'ERROR: unable to extract video URL')
715 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
717 # TODO: support choosing qualities
719 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
721 self._downloader.trouble(u'ERROR: unable to extract title')
723 video_title = unescapeHTML(mobj.group('title'))
725 video_uploader = None
726 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
728 # lookin for official user
729 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
730 if mobj_official is None:
731 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
733 video_uploader = mobj_official.group(1)
735 video_uploader = mobj.group(1)
737 video_upload_date = None
738 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
740 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
745 'uploader': video_uploader,
746 'upload_date': video_upload_date,
747 'title': video_title,
748 'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
769 def _real_extract(self, url):
770 # Extract id from URL
771 mobj = re.match(self._VALID_URL, url)
773 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
776 video_id = mobj.group(1)
778 video_extension = 'flv'
780 # Retrieve video webpage to extract further information
781 request = compat_urllib_request.Request(url)
783 self.report_download_webpage(video_id)
784 webpage = compat_urllib_request.urlopen(request).read()
785 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
786 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
789 # Extract URL, uploader, and title from webpage
790 self.report_extraction(video_id)
791 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
793 self._downloader.trouble(u'ERROR: unable to extract media URL')
795 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
799 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
801 self._downloader.trouble(u'ERROR: unable to extract title')
803 video_title = mobj.group(1).decode('utf-8')
805 video_uploader = mobj.group(2).decode('utf-8')
808 'id': video_id.decode('utf-8'),
809 'url': video_url.decode('utf-8'),
810 'uploader': video_uploader,
812 'title': video_title,
813 'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
838 def _real_extract(self, url, new_video=True):
839 # Extract ID from URL
840 mobj = re.match(self._VALID_URL, url)
842 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
845 video_id = mobj.group(2)
846 video_extension = 'flv'
848 # Rewrite valid but non-extractable URLs as
849 # extractable English language /watch/ URLs
850 if re.match(self._VPAGE_URL, url) is None:
851 request = compat_urllib_request.Request(url)
853 webpage = compat_urllib_request.urlopen(request).read()
854 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
855 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
858 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
860 self._downloader.trouble(u'ERROR: Unable to extract id field')
862 yahoo_id = mobj.group(1)
864 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
866 self._downloader.trouble(u'ERROR: Unable to extract vid field')
868 yahoo_vid = mobj.group(1)
870 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
871 return self._real_extract(url, new_video=False)
873 # Retrieve video webpage to extract further information
874 request = compat_urllib_request.Request(url)
876 self.report_download_webpage(video_id)
877 webpage = compat_urllib_request.urlopen(request).read()
878 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
879 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
882 # Extract uploader and title from webpage
883 self.report_extraction(video_id)
884 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
886 self._downloader.trouble(u'ERROR: unable to extract video title')
888 video_title = mobj.group(1).decode('utf-8')
890 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
892 self._downloader.trouble(u'ERROR: unable to extract video uploader')
894 video_uploader = mobj.group(1).decode('utf-8')
896 # Extract video thumbnail
897 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
899 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
901 video_thumbnail = mobj.group(1).decode('utf-8')
903 # Extract video description
904 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
906 self._downloader.trouble(u'ERROR: unable to extract video description')
908 video_description = mobj.group(1).decode('utf-8')
909 if not video_description:
910 video_description = 'No description available.'
912 # Extract video height and width
913 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
915 self._downloader.trouble(u'ERROR: unable to extract video height')
917 yv_video_height = mobj.group(1)
919 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
921 self._downloader.trouble(u'ERROR: unable to extract video width')
923 yv_video_width = mobj.group(1)
925 # Retrieve video playlist to extract media URL
926 # I'm not completely sure what all these options are, but we
927 # seem to need most of them, otherwise the server sends a 401.
928 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
929 yv_bitrate = '700' # according to Wikipedia this is hard-coded
930 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
931 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
932 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
934 self.report_download_webpage(video_id)
935 webpage = compat_urllib_request.urlopen(request).read()
936 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
937 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
940 # Extract media URL from playlist XML
941 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
943 self._downloader.trouble(u'ERROR: Unable to extract media URL')
945 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
946 video_url = unescapeHTML(video_url)
949 'id': video_id.decode('utf-8'),
951 'uploader': video_uploader,
953 'title': video_title,
954 'ext': video_extension.decode('utf-8'),
955 'thumbnail': video_thumbnail.decode('utf-8'),
956 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs.
    # Fix: the dot after (?:www|player) was unescaped and matched any
    # character; it is now escaped so only a literal '.' subdomain separator
    # matches.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page markup.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality bucket; abort if nothing matched.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams are recognized by an index-NNN.html final path component.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, or None after reporting an error."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and return a dict built from *matchTuples*.

        Each tuple is (group_index, key, error_message); missing groups abort
        extraction via self._downloader.trouble.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve a live-stream page to its rtmp URL components."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Fix: the concatenated regex fragments were plain strings containing
        # backslash escapes (\., \') — now raw strings so the escapes reach
        # the regex engine untouched.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r'.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                r'(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # NOTE(review): video_url is computed but not returned/downloaded in
        # the visible code — live-stream support looks unfinished; confirm.

    def extractPlus7Stream(self, url):
        """Follow the Plus7 redirect chain and return the video info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # NOTE(review): the _VALID_URL definition fell outside the visible chunk;
    # r'.*' matches anything, consistent with a last-resort extractor — confirm
    # against upstream.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our own opener so HEAD requests and 405 fallback are wired in.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # Fix: this branch previously reported "unable to extract title",
            # a copy-paste of the title branch's message.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]          # strip the literal 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            # Fix: list(<genexpr>) replaced with a plain list comprehension.
            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            # The API reports the real total; never ask for more than exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        # Split the 'gvsearchN:terms' query into its count prefix and terms.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        try:
            count = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return
        if count <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, query))
            return
        if count > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
            count = self._max_google_results
        self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        def _enqueue_all(found):
            for vid in found:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])

        collected = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers from the result page.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in collected:
                    collected.append(candidate)
                    if len(collected) == n:
                        # Specified n videos reached
                        _enqueue_all(collected)
                        return

            # Stop when no "next page" link remains.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                _enqueue_all(collected)
                return

            pagenum += 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        # Parse 'yvsearchN:terms' into a result count and the search terms.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        try:
            count = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return
        if count <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, query))
            return
        if count > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
            count = self._max_yahoo_results
        self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        def _enqueue_all(found):
            for vid in found:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])

        collected = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers; the set keeps lookups O(1).
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in already_seen:
                    collected.append(candidate)
                    already_seen.add(candidate)
                    if len(collected) == n:
                        # Specified n videos reached
                        _enqueue_all(collected)
                        return

            # Stop when no "Next" link remains on the result page.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                _enqueue_all(collected)
                return

            pagenum += 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # A trailing single-video component means: download just that video.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        all_ids = []
        pagenum = 1
        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated per page.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))
            all_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        total = len(all_ids)

        # Honor --playlist-start / --playlist-end (1-based, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            selected = all_ids[playliststart:]
        else:
            selected = all_ids[playliststart:playlistend]

        if len(selected) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(selected)))

        for vid in selected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages until no "Next" marker remains.
        channel_id = mobj.group(1)
        collected = []
        pagenum = 1
        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated per page.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))
            collected.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(collected)))

        for vid in collected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        collected = []
        pagenum = 0
        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated per page.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))
            collected.extend(page_ids)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break
            pagenum += 1

        all_ids_count = len(collected)
        # Honor --playlist-start / --playlist-end (1-based, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            selected = collected[playliststart:]
        else:
            selected = collected[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(selected)))

        for video_id in selected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # NOTE(review): _PAGE_SIZE is referenced below but its definition fell
    # outside the visible chunk; 12 matches the "currently to 12 videos"
    # comment — confirm against upstream.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Fix: was str(err); compat_str is what every other handler in
                # this file uses and is safe for non-ASCII error text on Py2.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honor --playlist-start / --playlist-end (1-based, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Fix: the pattern was a plain string '\s+' (invalid escape
                # sequence); now a raw string.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""
    # NOTE(review): reconstructed from a mangled listing (line numbers baked
    # into the text, indentation lost, several lines dropped) — verify
    # against upstream youtube-dl history.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    _available_formats = ['video', 'highqual', 'lowqual']
    # NOTE(review): dict contents were dropped from the listing; every known
    # Facebook format was served as mp4 — TODO confirm.
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General metadata is embedded in JS calls inside the page
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls, one per known format
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc, if given."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials: proceed anonymously
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form being present in the response means login failed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (optional: only warn when missing)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    # Best-effort: keep upload_date as None on a malformed date
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        if format_limit is not None and format_limit in self._available_formats:
            format_list = self._available_formats[self._available_formats.index(format_limit):]
        else:
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
            return
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        else:
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""
    # NOTE(review): reconstructed from a mangled listing (line numbers baked
    # into the text, indentation lost, some lines dropped) — verify against
    # upstream youtube-dl history.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query with the right separator
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' key or bare
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves the media only to this user agent
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""
    # NOTE(review): reconstructed from a mangled listing; additionally fixes
    # a real bug — the invalid-URL branch called self._download.trouble
    # (missing 'er'), which would raise AttributeError instead of reporting.

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._download.trouble (AttributeError)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link encodes the media directory; the flv lives next to it
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """
    # NOTE(review): reconstructed from a mangled listing (line numbers baked
    # into the text, indentation lost, several lines dropped, including the
    # format/dimension tables) — verify against upstream youtube-dl history.

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    IE_NAME = u'comedycentral'

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): table contents reconstructed — all bitrates are mp4; TODO confirm
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # NOTE(review): dimensions reconstructed — TODO confirm
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose regexp, so the base-class match
        # (which does not pass re.VERBOSE) cannot be reused.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # Expand :tds / :colbert style abbreviations to the newest episode
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The index page redirects to the newest full episode
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """
    # NOTE(review): reconstructed from a mangled listing (line numbers baked
    # into the text, indentation lost, some lines dropped) — verify against
    # upstream youtube-dl history.

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset declared in the Content-Type header, default utf-8
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""
    # NOTE(review): reconstructed from a mangled listing (line numbers baked
    # into the text, indentation lost, several lines dropped, including the
    # final url/ext assignments) — verify against upstream youtube-dl history.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the fragment URL out of the manifest location and media ids
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        # NOTE(review): f4f fragment container — TODO confirm against upstream
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""
    # NOTE(review): reconstructed from a mangled listing (line numbers baked
    # into the text, indentation lost, some lines dropped) — verify against
    # upstream youtube-dl history.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # Extract video URL
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """
    # NOTE(review): reconstructed from a mangled listing (line numbers baked
    # into the text, indentation lost, some lines dropped) — verify against
    # upstream youtube-dl history.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the permalink to the track's API resource
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    # NOTE(review): reconstructed from a mangled listing (line numbers baked
    # into the text, indentation lost, some lines dropped; IE_NAME line was
    # dropped and is assumed to be u'infoq') — verify against upstream.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL: the base64-encoded rtmp path lives in jsclassref
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
# NOTE(review): every line below starts with what appears to be the source
# file's own line number; gaps in that numbering mark lines (try:/if guards,
# returns, blank lines) elided from this capture. Code is left byte-identical;
# only comments/docstrings are added.
2902 class MixcloudIE(InfoExtractor):
2903 """Information extractor for www.mixcloud.com"""
2905 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2906 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2907 IE_NAME = u'mixcloud'
2909 def __init__(self, downloader=None):
2910 InfoExtractor.__init__(self, downloader)
# Progress reporters: route status messages through the shared downloader.
2912 def report_download_json(self, file_id):
2913 """Report JSON download."""
2914 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2916 def report_extraction(self, file_id):
2917 """Report information extraction."""
2918 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2920 def get_urls(self, jsonData, fmt, bitrate='best'):
2921 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either {bitrate: [urls]} or a plain url list; the
# TypeError handler at line 2929 covers the no-bitrate-info shape.
2924 bitrate_list = jsonData[fmt]
2925 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2926 bitrate = max(bitrate_list) # select highest
2928 url_list = jsonData[fmt][bitrate]
2929 except TypeError: # we have no bitrate info.
2930 url_list = jsonData[fmt]
2933 def check_urls(self, url_list):
2934 """Returns 1st active url from list"""
# Probes each candidate URL; network errors (except clause at 2939)
# evidently fall through to the next candidate.
2935 for url in url_list:
2937 compat_urllib_request.urlopen(url)
2939 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Debug helper for --list-formats: prints "format<TAB>bitrate<TAB>[ext]".
2944 def _print_formats(self, formats):
2945 print('Available formats:')
2946 for fmt in formats.keys():
2947 for b in formats[fmt]:
2949 ext = formats[fmt][b][0]
2950 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2951 except TypeError: # we have no bitrate info
2952 ext = formats[fmt][0]
2953 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2956 def _real_extract(self, url):
2957 mobj = re.match(self._VALID_URL, url)
2959 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2961 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex groups presumes Python 2 byte
# strings -- would raise AttributeError on a Python 3 str; confirm target.
2962 uploader = mobj.group(1).decode('utf-8')
2963 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2965 # construct API request
2966 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2967 # retrieve .json file with links to files
2968 request = compat_urllib_request.Request(file_url)
2970 self.report_download_json(file_url)
2971 jsonData = compat_urllib_request.urlopen(request).read()
2972 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2973 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
# Parse the API response and pick a format.
2977 json_data = json.loads(jsonData)
2978 player_url = json_data['player_swf_url']
2979 formats = dict(json_data['audio_formats'])
2981 req_format = self._downloader.params.get('format', None)
2984 if self._downloader.params.get('listformats', None):
2985 self._print_formats(formats)
# No preference (or 'best'): scan every format for the first live URL.
2988 if req_format is None or req_format == 'best':
2989 for format_param in formats.keys():
2990 url_list = self.get_urls(formats, format_param)
2992 file_url = self.check_urls(url_list)
2993 if file_url is not None:
2996 if req_format not in formats:
2997 self._downloader.trouble(u'ERROR: format is not available')
3000 url_list = self.get_urls(formats, req_format)
3001 file_url = self.check_urls(url_list)
3002 format_param = req_format
3005 'id': file_id.decode('utf-8'),
3006 'url': file_url.decode('utf-8'),
3007 'uploader': uploader.decode('utf-8'),
3008 'upload_date': None,
3009 'title': json_data['name'],
3010 'ext': file_url.split('.')[-1].decode('utf-8'),
# NOTE(review): legacy 'cond and a or b' ternary idiom; the modern,
# safer spelling is "a if cond else b".
3011 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3012 'thumbnail': json_data['thumbnail_url'],
3013 'description': json_data['description'],
3014 'player_url': player_url.decode('utf-8'),
# NOTE(review): lines keep the source file's own line numbers; numbering gaps
# mark elided lines (try:/if/else headers, returns, dict delimiters).
3017 class StanfordOpenClassroomIE(InfoExtractor):
3018 """Information extractor for Stanford's Open ClassRoom"""
# NOTE(review): the dots in "openclassroom.stanford.edu" are unescaped and
# therefore match any character -- harmless in practice but imprecise.
3020 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3021 IE_NAME = u'stanfordoc'
3023 def report_download_webpage(self, objid):
3024 """Report webpage download."""
3025 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3027 def report_extraction(self, video_id):
3028 """Report information extraction."""
3029 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Three-way dispatch on the URL: a single video, a course page, or the
# site root. The latter two recurse through self.extract on found links.
3031 def _real_extract(self, url):
3032 mobj = re.match(self._VALID_URL, url)
3034 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3037 if mobj.group('course') and mobj.group('video'): # A specific video
3038 course = mobj.group('course')
3039 video = mobj.group('video')
3041 'id': course + '_' + video,
3043 'upload_date': None,
3046 self.report_extraction(info['id'])
3047 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3048 xmlUrl = baseUrl + video + '.xml'
3050 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3051 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3052 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Per-video metadata is an XML document with <title> and <videoFile>.
3054 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3056 info['title'] = mdoc.findall('./title')[0].text
3057 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3059 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3061 info['ext'] = info['url'].rpartition('.')[2]
3063 elif mobj.group('course'): # A course page
3064 course = mobj.group('course')
3069 'upload_date': None,
3072 self.report_download_webpage(info['id'])
3074 coursepage = compat_urllib_request.urlopen(url).read()
3075 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3076 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
# Title falls back to the id when no <h1> is present (line 3083 branch).
3079 m = re.search('<h1>([^<]+)</h1>', coursepage)
3081 info['title'] = unescapeHTML(m.group(1))
3083 info['title'] = info['id']
3085 m = re.search('<description>([^<]+)</description>', coursepage)
3087 info['description'] = unescapeHTML(m.group(1))
# Collect VideoPage links (deduplicated, order-preserving) as references.
3089 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3092 'type': 'reference',
3093 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3097 for entry in info['list']:
3098 assert entry['type'] == 'reference'
3099 results += self.extract(entry['url'])
# else-branch: site root -- enumerate all course pages.
3104 'id': 'Stanford OpenClassroom',
3107 'upload_date': None,
3110 self.report_download_webpage(info['id'])
3111 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3113 rootpage = compat_urllib_request.urlopen(rootURL).read()
3114 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3115 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3118 info['title'] = info['id']
3120 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3123 'type': 'reference',
3124 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3129 for entry in info['list']:
3130 assert entry['type'] == 'reference'
3131 results += self.extract(entry['url'])
# NOTE(review): lines keep the source file's own line numbers; numbering gaps
# mark elided lines (if-mobj-is-None guards, try: headers, returns).
3134 class MTVIE(InfoExtractor):
3135 """Information extractor for MTV.com"""
3137 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3140 def report_webpage(self, video_id):
3141 """Report webpage download."""
3142 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3144 def report_extraction(self, video_id):
3145 """Report information extraction."""
3146 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3148 def _real_extract(self, url):
3149 mobj = re.match(self._VALID_URL, url)
3151 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Normalize scheme-less URLs before fetching.
3153 if not mobj.group('proto'):
3154 url = 'http://' + url
3155 video_id = mobj.group('videoid')
3156 self.report_webpage(video_id)
3158 request = compat_urllib_request.Request(url)
3160 webpage = compat_urllib_request.urlopen(request).read()
3161 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3162 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# Song/performer/uri/content-id all come from <meta> tags in the page.
# NOTE(review): .decode('iso-8859-1') presumes Python 2 byte strings.
3165 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3167 self._downloader.trouble(u'ERROR: unable to extract song name')
3169 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3170 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3172 self._downloader.trouble(u'ERROR: unable to extract performer')
3174 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3175 video_title = performer + ' - ' + song_name
3177 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message below reads "unable to mtvn_uri" -- the word
# "extract" is missing (runtime string, left unchanged in this doc pass).
3179 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3181 mtvn_uri = mobj.group(1)
3183 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3185 self._downloader.trouble(u'ERROR: unable to extract content id')
3187 content_id = mobj.group(1)
# mediaGen service returns an XML playlist of renditions for this video.
3189 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3190 self.report_extraction(video_id)
3191 request = compat_urllib_request.Request(videogen_url)
3193 metadataXml = compat_urllib_request.urlopen(request).read()
3194 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3195 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3198 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3199 renditions = mdoc.findall('.//rendition')
3201 # For now, always pick the highest quality.
3202 rendition = renditions[-1]
# e.g. type="video/mp4" -> ext "mp4"; format string is "ext-WxH_bitrate".
3205 _,_,ext = rendition.attrib['type'].partition('/')
3206 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3207 video_url = rendition.find('./src').text
3209 self._downloader.trouble('Invalid rendition field.')
3215 'uploader': performer,
3216 'upload_date': None,
3217 'title': video_title,
# NOTE(review): lines keep the source file's own line numbers; numbering gaps
# mark elided lines (method headers, try:, for-loops, returns, format choice
# branches).
3225 class YoukuIE(InfoExtractor):
3227 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3230 def __init__(self, downloader=None):
3231 InfoExtractor.__init__(self, downloader)
3233 def report_download_webpage(self, file_id):
3234 """Report webpage download."""
3235 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3237 def report_extraction(self, file_id):
3238 """Report information extraction."""
3239 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# (_gen_sid body) Session id: millisecond timestamp + two random components.
3242 nowTime = int(time.time() * 1000)
3243 random1 = random.randint(1000,1998)
3244 random2 = random.randint(1000,9999)
3246 return "%d%d%d" %(nowTime,random1,random2)
3248 def _get_file_ID_mix_string(self, seed):
# Deterministic PRNG-driven shuffle of the alphabet below; 'seed' comes
# from the site's playlist JSON and keys the character permutation.
3250 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3252 for i in range(len(source)):
3253 seed = (seed * 211 + 30031 ) % 65536
3254 index = math.floor(seed / 65536 * len(source) )
3255 mixed.append(source[int(index)])
3256 source.remove(source[int(index)])
3257 #return ''.join(mixed)
3260 def _get_file_id(self, fileId, seed):
# Decode the '*'-separated index string through the seed-shuffled alphabet.
3261 mixed = self._get_file_ID_mix_string(seed)
3262 ids = fileId.split('*')
3266 realId.append(mixed[int(ch)])
3267 return ''.join(realId)
3269 def _real_extract(self, url):
3270 mobj = re.match(self._VALID_URL, url)
3272 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3274 video_id = mobj.group('ID')
3276 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3278 request = compat_urllib_request.Request(info_url, None, std_headers)
3280 self.report_download_webpage(video_id)
3281 jsondata = compat_urllib_request.urlopen(request).read()
3282 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3283 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3286 self.report_extraction(video_id)
3288 jsonstr = jsondata.decode('utf-8')
3289 config = json.loads(jsonstr)
3291 video_title = config['data'][0]['title']
3292 seed = config['data'][0]['seed']
3294 format = self._downloader.params.get('format', None)
3295 supported_format = list(config['data'][0]['streamfileids'].keys())
# 'best' prefers hd2 when available; 'worst' branch elided in this capture.
3297 if format is None or format == 'best':
3298 if 'hd2' in supported_format:
3303 elif format == 'worst':
3311 fileid = config['data'][0]['streamfileids'][format]
3312 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3313 except (UnicodeDecodeError, ValueError, KeyError):
3314 self._downloader.trouble(u'ERROR: unable to extract info section')
3318 sid = self._gen_sid()
3319 fileid = self._get_file_id(fileid, seed)
3321 #column 8,9 of fileid represent the segment number
3322 #fileid[7:9] should be changed
# NOTE(review): the comment above says fileid[7:9], but line 3325 actually
# substitutes fileid[8:10] (keeps [0:8], skips to [10:]) -- confirm which
# is intended; the code, not the comment, is what runs.
3323 for index, key in enumerate(keys):
3325 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3326 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# Each segment becomes its own info dict: "<id>_partNN".
3329 'id': '%s_part%02d' % (video_id, index),
3330 'url': download_url,
3332 'upload_date': None,
3333 'title': video_title,
3336 files_info.append(info)
# NOTE(review): lines keep the source file's own line numbers; numbering gaps
# mark elided lines (guards, try:, return).
3341 class XNXXIE(InfoExtractor):
3342 """Information extractor for xnxx.com"""
3344 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: flv URL and thumbnail come from flashvars-style
# query parameters, the title from the <title> tag.
3346 VIDEO_URL_RE = r'flv_url=(.*?)&'
3347 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3348 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3350 def report_webpage(self, video_id):
3351 """Report webpage download"""
3352 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3354 def report_extraction(self, video_id):
3355 """Report information extraction"""
3356 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3358 def _real_extract(self, url):
3359 mobj = re.match(self._VALID_URL, url)
3361 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3363 video_id = mobj.group(1)
3365 self.report_webpage(video_id)
3367 # Get webpage content
3369 webpage_bytes = compat_urllib_request.urlopen(url).read()
3370 webpage = webpage_bytes.decode('utf-8')
3371 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3372 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3375 result = re.search(self.VIDEO_URL_RE, webpage)
3377 self._downloader.trouble(u'ERROR: unable to extract video url')
# flv_url is percent-encoded inside the page.
3379 video_url = compat_urllib_parse.unquote(result.group(1))
3381 result = re.search(self.VIDEO_TITLE_RE, webpage)
3383 self._downloader.trouble(u'ERROR: unable to extract video title')
3385 video_title = result.group(1)
3387 result = re.search(self.VIDEO_THUMB_RE, webpage)
3389 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3391 video_thumbnail = result.group(1)
3397 'upload_date': None,
3398 'title': video_title,
3400 'thumbnail': video_thumbnail,
3401 'description': None,
# NOTE(review): lines keep the source file's own line numbers; numbering gaps
# mark elided lines (guards, try:, else branches, return).
3405 class GooglePlusIE(InfoExtractor):
3406 """Information extractor for plus.google.com."""
3408 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3409 IE_NAME = u'plus.google'
3411 def __init__(self, downloader=None):
3412 InfoExtractor.__init__(self, downloader)
3414 def report_extract_entry(self, url):
3415 """Report downloading entry."""
3416 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3418 def report_date(self, upload_date):
3419 """Report entry upload date."""
3420 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3422 def report_uploader(self, uploader):
3423 """Report entry uploader."""
3424 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3426 def report_title(self, video_title):
3427 """Report entry title."""
3428 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3430 def report_extract_vid_page(self, video_page):
3431 """Report information extraction."""
3432 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3434 def _real_extract(self, url):
3435 # Extract id from URL
3436 mobj = re.match(self._VALID_URL, url)
3438 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3441 post_url = mobj.group(0)
3442 video_id = mobj.group(1)
3444 video_extension = 'flv'
3446 # Step 1, Retrieve post webpage to extract further information
3447 self.report_extract_entry(post_url)
3448 request = compat_urllib_request.Request(post_url)
3450 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3451 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3452 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3455 # Extract update date
3457 pattern = 'title="Timestamp">(.*?)</a>'
3458 mobj = re.search(pattern, webpage)
3460 upload_date = mobj.group(1)
3461 # Convert timestring to a format suitable for filename
3462 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3463 upload_date = upload_date.strftime('%Y%m%d')
3464 self.report_date(upload_date)
# Uploader is taken from the rel="author" anchor.
3468 pattern = r'rel\="author".*?>(.*?)</a>'
3469 mobj = re.search(pattern, webpage)
3471 uploader = mobj.group(1)
3472 self.report_uploader(uploader)
3475 # Get the first line for title
3477 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3478 mobj = re.search(pattern, webpage)
3480 video_title = mobj.group(1)
3481 self.report_title(video_title)
3483 # Step 2, Stimulate clicking the image box to launch video
3484 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3485 mobj = re.search(pattern, webpage)
3487 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3489 video_page = mobj.group(1)
3490 request = compat_urllib_request.Request(video_page)
3492 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3493 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3494 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3496 self.report_extract_vid_page(video_page)
3499 # Extract video links on video page
3500 """Extract video links of all sizes"""
3501 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3502 mobj = re.findall(pattern, webpage)
3504 self._downloader.trouble(u'ERROR: unable to extract video links')
3506 # Sort in resolution
3507 links = sorted(mobj)
3509 # Choose the lowest of the sort, i.e. highest resolution
3510 video_url = links[-1]
3511 # Only get the url. The resolution part in the tuple has no use anymore
3512 video_url = video_url[-1]
3513 # Treat escaped \u0026 style hex
# Py2 str has .decode; on Py3 the AttributeError branch re-encodes and
# decodes via 'unicode-escape' to resolve \uXXXX escapes.
3515 video_url = video_url.decode("unicode_escape")
3516 except AttributeError: # Python 3
3517 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3523 'uploader': uploader,
3524 'upload_date': upload_date,
3525 'title': video_title,
3526 'ext': video_extension,
# NOTE(review): lines keep the source file's own line numbers; numbering gaps
# mark elided lines (guards, try:, default-return branch of _findProp).
3529 class NBAIE(InfoExtractor):
3530 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3533 def report_extraction(self, video_id):
3534 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3536 def _real_extract(self, url):
3537 mobj = re.match(self._VALID_URL, url)
3539 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3542 video_id = mobj.group(1)
3543 if video_id.endswith('/index.html'):
3544 video_id = video_id[:-len('/index.html')]
3546 self.report_extraction(video_id)
3548 urlh = compat_urllib_request.urlopen(url)
3549 webpage_bytes = urlh.read()
3550 webpage = webpage_bytes.decode('utf-8', 'ignore')
3551 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3552 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# The media URL is built from the id rather than scraped from the page.
3555 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Small helper: first regex group from the page, unescaped, else default.
3556 def _findProp(rexp, default=None):
3557 m = re.search(rexp, webpage)
3559 return unescapeHTML(m.group(1))
3563 shortened_video_id = video_id.rpartition('/')[2]
3564 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3566 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for 'upload_date' (the
# field name documented on InfoExtractor) -- confirm before fixing, as this
# is a behavior change.
3570 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3571 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): lines keep the source file's own line numbers; numbering gaps
# mark elided lines (guards, try:, loop/return scaffolding).
3575 class JustinTVIE(InfoExtractor):
3576 """Information extractor for justin.tv and twitch.tv"""
3577 # TODO: One broadcast may be split into multiple videos. The key
3578 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3579 # starts at 1 and increases. Can we treat all parts as one video?
3581 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3582 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3583 _JUSTIN_PAGE_LIMIT = 100
3584 IE_NAME = u'justin.tv'
3586 def report_extraction(self, file_id):
3587 """Report information extraction."""
3588 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3590 def report_download_page(self, channel, offset):
3591 """Report attempt to download a single page of videos."""
3592 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3593 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3595 # Return count of items, list of *valid* items
3596 def _parse_page(self, url):
3598 urlh = compat_urllib_request.urlopen(url)
3599 webpage_bytes = urlh.read()
3600 webpage = webpage_bytes.decode('utf-8', 'ignore')
3601 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3602 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
# API returns a JSON list of clips; each becomes one info dict.
3605 response = json.loads(webpage)
3607 for clip in response:
3608 video_url = clip['video_file_url']
3610 video_extension = os.path.splitext(video_url)[1][1:]
# created_on is ISO-ish 'YYYY-MM-DD...'; strip dashes -> YYYYMMDD.
3611 video_date = re.sub('-', '', clip['created_on'][:10])
3615 'title': clip['title'],
3616 'uploader': clip.get('user_id', clip.get('channel_id')),
3617 'upload_date': video_date,
3618 'ext': video_extension,
3620 return (len(response), info)
3622 def _real_extract(self, url):
3623 mobj = re.match(self._VALID_URL, url)
3625 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 2 present => a single archived broadcast; otherwise a whole
# channel, which is paged through the archives endpoint.
3628 api = 'http://api.justin.tv'
3629 video_id = mobj.group(mobj.lastindex)
3631 if mobj.lastindex == 1:
3633 api += '/channel/archives/%s.json'
3635 api += '/clip/show/%s.json'
3636 api = api % (video_id,)
3638 self.report_extraction(video_id)
3642 limit = self._JUSTIN_PAGE_LIMIT
3645 self.report_download_page(video_id, offset)
3646 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3647 page_count, page_info = self._parse_page(page_url)
3648 info.extend(page_info)
# A short page (fewer than `limit` items) means we reached the end.
3649 if not paged or page_count != limit:
# NOTE(review): lines keep the source file's own line numbers; numbering gaps
# mark elided lines (guards, try:, return).
3654 class FunnyOrDieIE(InfoExtractor):
3655 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3656 IE_NAME = u'FunnyOrDie'
3658 def report_extraction(self, video_id):
3659 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3661 def _real_extract(self, url):
3662 mobj = re.match(self._VALID_URL, url)
3664 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3667 video_id = mobj.group('id')
3668 self.report_extraction(video_id)
3670 urlh = compat_urllib_request.urlopen(url)
3671 webpage_bytes = urlh.read()
3672 webpage = webpage_bytes.decode('utf-8', 'ignore')
3673 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3674 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Media URL: second <source> inside the <video> element.
3677 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3679 self._downloader.trouble(u'ERROR: unable to find video information')
3680 video_url = unescapeHTML(m.group('url'))
3682 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3684 self._downloader.trouble(u'Cannot find video title')
3685 title = unescapeHTML(m.group('title'))
3687 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3689 desc = unescapeHTML(m.group('desc'))
3698 'description': desc,
3702 class TweetReelIE(InfoExtractor):
3703 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3705 def report_extraction(self, video_id):
3706 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3708 def _real_extract(self, url):
3709 mobj = re.match(self._VALID_URL, url)
3711 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3714 video_id = mobj.group('id')
3715 self.report_extraction(video_id)
3717 urlh = compat_urllib_request.urlopen(url)
3718 webpage_bytes = urlh.read()
3719 webpage = webpage_bytes.decode('utf-8', 'ignore')
3720 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3721 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
3724 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3726 self._downloader.trouble(u'ERROR: Cannot find status ID')
3727 status_id = m.group(1)
3729 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3731 self._downloader.trouble(u'WARNING: Cannot find description')
3732 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3734 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3736 self._downloader.trouble(u'ERROR: Cannot find uploader')
3737 uploader = unescapeHTML(m.group('uploader'))
3738 uploader_id = unescapeHTML(m.group('uploader_id'))
3740 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3742 self._downloader.trouble(u'ERROR: Cannot find upload date')
3743 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3746 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3753 'description': desc,
3754 'uploader': uploader,
3755 'uploader_id': uploader_id,
3756 'internal_id': status_id,
3757 'upload_date': upload_date