2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
14 import xml.etree.ElementTree
21 class InfoExtractor(object):
22 """Information Extractor class.
24 Information extractors are the classes that, given a URL, extract
25 information about the video (or videos) the URL refers to. This
26 information includes the real video URL, the video title, author and
27 others. The information is stored in a dictionary which is then
28 passed to the FileDownloader. The FileDownloader processes this
29 information possibly downloading the video to the file system, among
30 other possible outcomes.
32 The dictionaries must include the following fields:
36 title: Video title, unescaped.
37 ext: Video filename extension.
39 The following fields are optional:
41 format: The video format, defaults to ext (used for --get-format)
42 thumbnail: Full URL to a video thumbnail image.
43 description: One-line video description.
44 uploader: Full name of the video uploader.
45 upload_date: Video upload date (YYYYMMDD).
46 uploader_id: Nickname or id of the video uploader.
47 location: Physical location of the video.
48 player_url: SWF Player URL (used for rtmpdump).
49 subtitles: The .srt file contents.
50 urlhandle: [internal] The urlHandle to be used to download the file,
51 like returned by urllib.request.urlopen
53 The fields should all be Unicode strings.
55 Subclasses of this one should re-define the _real_initialize() and
56 _real_extract() methods and define a _VALID_URL regexp.
57 Probably, they should also be added to the list of extractors.
59 _real_extract() must return a *list* of information dictionaries as
62 Finally, the _WORKING attribute should be set to False for broken IEs
63 in order to warn the users and skip the tests.
70 def __init__(self, downloader=None):
71 """Constructor. Receives an optional downloader."""
73 self.set_downloader(downloader)
75 def suitable(self, url):
76 """Receives a URL and returns True if suitable for this IE."""
77 return re.match(self._VALID_URL, url) is not None
80 """Getter method for _WORKING."""
84 """Initializes an instance (authentication, etc)."""
86 self._real_initialize()
89 def extract(self, url):
90 """Extracts URL information and returns it in list of dicts."""
92 return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # The downloader is used throughout the extractors for user-visible
        # output (self._downloader.to_screen) and option lookup
        # (self._downloader.params); it may be None until one is attached.
        self._downloader = downloader
98 def _real_initialize(self):
99 """Real initialization process. Redefine in subclasses."""
102 def _real_extract(self, url):
103 """Real extraction process. Redefine in subclasses."""
108 return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): interior lines are elided in this view (the
        # `if note is None:` guard and the `try:` opener are not visible);
        # comments below describe only the visible statements.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        # Open the URL (or prepared Request) and hand back the live handle.
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, keeping the original traceback object.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
122 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
123 """ Returns the data of the page as a string """
124 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
125 webpage_bytes = urlh.read()
126 return webpage_bytes.decode('utf-8', 'replace')
129 class YoutubeIE(InfoExtractor):
130 """Information extractor for youtube.com."""
134 (?:https?://)? # http(s):// (optional)
135 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
136 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
137 (?:.*?\#/)? # handle anchor (#/) redirect urls
138 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
139 (?: # the various things that can precede the ID:
140 (?:(?:v|embed|e)/) # v/ or embed/ or e/
141 |(?: # or the v= param in all its forms
142 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
143 (?:\?|\#!?) # the params delimiter ? or # or #!
144 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
147 )? # optional -> youtube.com/xxxx is OK
148 )? # all until now is optional -> you can pass the naked ID
149 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
150 (?(1).+)? # if we found the ID, everything can follow
152 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
153 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
154 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
155 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
156 _NETRC_MACHINE = 'youtube'
157 # Listed in order of quality
158 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
159 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
160 _video_extensions = {
166 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
172 _video_dimensions = {
190 def suitable(self, url):
191 """Receives a URL and returns True if suitable for this IE."""
192 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
194 def report_lang(self):
195 """Report attempt to set language."""
196 self._downloader.to_screen(u'[youtube] Setting language')
198 def report_login(self):
199 """Report attempt to log in."""
200 self._downloader.to_screen(u'[youtube] Logging in')
202 def report_age_confirmation(self):
203 """Report attempt to confirm age."""
204 self._downloader.to_screen(u'[youtube] Confirming age')
206 def report_video_webpage_download(self, video_id):
207 """Report attempt to download video webpage."""
208 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
210 def report_video_info_webpage_download(self, video_id):
211 """Report attempt to download video info webpage."""
212 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        # (Docstring fixed: it was copy-pasted from the info-webpage method.)
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
218 def report_information_extraction(self, video_id):
219 """Report attempt to extract video information."""
220 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        # (Docstring fixed: it previously said "Report extracted video URL.")
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
226 def report_rtmp_download(self):
227 """Indicate the download will use the RTMP protocol."""
228 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SRT-formatted subtitle text.

        NOTE(review): several lines are elided in this view (the `srt`
        accumulator initialisation, the float conversion of `start`, and
        the final return are not visible); comments cover only what is shown.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default duration when the tag omits one
            end = start + float(dur)
            # Render both timestamps as HH:MM:SS,mmm (the SRT timestamp form).
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
    def _extract_subtitles(self, video_id):
        """Fetch closed captions for *video_id* and convert them to SRT.

        Returns a pair: (warning_message, None) on failure, or
        (None, srt_text) on success.

        NOTE(review): the two `try:` openers, the assignment in the 'en'
        branch, and an `if` guard before the final fallback return are
        elided in this view; comments cover only the visible statements.
        """
        self.report_video_subtitles_download(video_id)
        # First request: list the caption tracks available for this video.
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Build a language_code -> track_name map from the track list.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Choose the language: user-requested first, then English, then any.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        # Second request: fetch the chosen caption track's XML.
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))
    def _print_formats(self, formats):
        """Print each available format with its extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the loop header is elided in this view; `x` is
        # presumably each itag string from *formats* — confirm upstream.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
280 def _real_initialize(self):
281 if self._downloader is None:
286 downloader_params = self._downloader.params
288 # Attempt to use provided username and password or .netrc data
289 if downloader_params.get('username', None) is not None:
290 username = downloader_params['username']
291 password = downloader_params['password']
292 elif downloader_params.get('usenetrc', False):
294 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
299 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
300 except (IOError, netrc.NetrcParseError) as err:
301 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
305 request = compat_urllib_request.Request(self._LANG_URL)
308 compat_urllib_request.urlopen(request).read()
309 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
310 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
313 # No authentication to be performed
319 'current_form': 'loginForm',
321 'action_login': 'Log In',
322 'username': username,
323 'password': password,
325 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
328 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
329 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
330 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
332 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
333 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
339 'action_confirm': 'Confirm',
341 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
343 self.report_age_confirmation()
344 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
345 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
346 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Extract and return the YouTube video id from *url*.

        NOTE(review): the `if mobj is None:` guard and the final return
        are elided in this view.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # The video id is taken from capture group 2 of _VALID_URL.
        video_id = mobj.group(2)
357 def _real_extract(self, url):
358 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
359 mobj = re.search(self._NEXT_URL_RE, url)
361 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
362 video_id = self._extract_id(url)
365 self.report_video_webpage_download(video_id)
366 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
367 request = compat_urllib_request.Request(url)
369 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
370 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
371 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
374 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
376 # Attempt to extract SWF player URL
377 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
379 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
384 self.report_video_info_webpage_download(video_id)
385 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
386 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
387 % (video_id, el_type))
388 request = compat_urllib_request.Request(video_info_url)
390 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
391 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
392 video_info = compat_parse_qs(video_info_webpage)
393 if 'token' in video_info:
395 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
396 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
398 if 'token' not in video_info:
399 if 'reason' in video_info:
400 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
402 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
405 # Check for "rental" videos
406 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
407 self._downloader.trouble(u'ERROR: "rental" videos not supported')
410 # Start extracting information
411 self.report_information_extraction(video_id)
414 if 'author' not in video_info:
415 self._downloader.trouble(u'ERROR: unable to extract uploader name')
417 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
420 video_uploader_id = None
421 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
423 video_uploader_id = mobj.group(1)
425 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
428 if 'title' not in video_info:
429 self._downloader.trouble(u'ERROR: unable to extract video title')
431 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
434 if 'thumbnail_url' not in video_info:
435 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
437 else: # don't panic if we can't find it
438 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
442 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
444 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
445 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
446 for expression in format_expressions:
448 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
453 video_description = get_element_by_id("eow-description", video_webpage)
454 if video_description:
455 video_description = clean_html(video_description)
457 video_description = ''
460 video_subtitles = None
461 if self._downloader.params.get('writesubtitles', False):
462 (srt_error, video_subtitles) = self._extract_subtitles(video_id)
464 self._downloader.trouble(srt_error)
466 if 'length_seconds' not in video_info:
467 self._downloader.trouble(u'WARNING: unable to extract video duration')
470 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
473 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
475 # Decide which formats to download
476 req_format = self._downloader.params.get('format', None)
478 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
479 self.report_rtmp_download()
480 video_url_list = [(None, video_info['conn'][0])]
481 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
482 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
483 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
484 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
485 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
487 format_limit = self._downloader.params.get('format_limit', None)
488 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
489 if format_limit is not None and format_limit in available_formats:
490 format_list = available_formats[available_formats.index(format_limit):]
492 format_list = available_formats
493 existing_formats = [x for x in format_list if x in url_map]
494 if len(existing_formats) == 0:
495 self._downloader.trouble(u'ERROR: no known formats available for video')
497 if self._downloader.params.get('listformats', None):
498 self._print_formats(existing_formats)
500 if req_format is None or req_format == 'best':
501 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
502 elif req_format == 'worst':
503 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
504 elif req_format in ('-1', 'all'):
505 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
507 # Specific formats. We pick the first in a slash-delimeted sequence.
508 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
509 req_formats = req_format.split('/')
510 video_url_list = None
511 for rf in req_formats:
513 video_url_list = [(rf, url_map[rf])]
515 if video_url_list is None:
516 self._downloader.trouble(u'ERROR: requested format not available')
519 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
523 for format_param, video_real_url in video_url_list:
525 video_extension = self._video_extensions.get(format_param, 'flv')
527 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
528 self._video_dimensions.get(format_param, '???'))
532 'url': video_real_url,
533 'uploader': video_uploader,
534 'uploader_id': video_uploader_id,
535 'upload_date': upload_date,
536 'title': video_title,
537 'ext': video_extension,
538 'format': video_format,
539 'thumbnail': video_thumbnail,
540 'description': video_description,
541 'player_url': player_url,
542 'subtitles': video_subtitles,
543 'duration': video_duration
548 class MetacafeIE(InfoExtractor):
549 """Information Extractor for metacafe.com."""
551 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
552 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
553 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
554 IE_NAME = u'metacafe'
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)
559 def report_disclaimer(self):
560 """Report disclaimer retrieval."""
561 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
563 def report_age_confirmation(self):
564 """Report attempt to confirm age."""
565 self._downloader.to_screen(u'[metacafe] Confirming age')
567 def report_download_webpage(self, video_id):
568 """Report webpage download."""
569 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
571 def report_extraction(self, video_id):
572 """Report information extraction."""
573 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
575 def _real_initialize(self):
576 # Retrieve disclaimer
577 request = compat_urllib_request.Request(self._DISCLAIMER)
579 self.report_disclaimer()
580 disclaimer = compat_urllib_request.urlopen(request).read()
581 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
582 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
588 'submit': "Continue - I'm over 18",
590 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
592 self.report_age_confirmation()
593 disclaimer = compat_urllib_request.urlopen(request).read()
594 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
595 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
598 def _real_extract(self, url):
599 # Extract id and simplified title from URL
600 mobj = re.match(self._VALID_URL, url)
602 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
605 video_id = mobj.group(1)
607 # Check if video comes from YouTube
608 mobj2 = re.match(r'^yt-(.*)$', video_id)
609 if mobj2 is not None:
610 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
613 # Retrieve video webpage to extract further information
614 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
616 self.report_download_webpage(video_id)
617 webpage = compat_urllib_request.urlopen(request).read()
618 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
619 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
622 # Extract URL, uploader and title from webpage
623 self.report_extraction(video_id)
624 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
626 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
627 video_extension = mediaURL[-3:]
629 # Extract gdaKey if available
630 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
634 gdaKey = mobj.group(1)
635 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
637 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
639 self._downloader.trouble(u'ERROR: unable to extract media URL')
641 vardict = compat_parse_qs(mobj.group(1))
642 if 'mediaData' not in vardict:
643 self._downloader.trouble(u'ERROR: unable to extract media URL')
645 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
647 self._downloader.trouble(u'ERROR: unable to extract media URL')
649 mediaURL = mobj.group(1).replace('\\/', '/')
650 video_extension = mediaURL[-3:]
651 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
653 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
655 self._downloader.trouble(u'ERROR: unable to extract title')
657 video_title = mobj.group(1).decode('utf-8')
659 mobj = re.search(r'submitter=(.*?);', webpage)
661 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
663 video_uploader = mobj.group(1)
666 'id': video_id.decode('utf-8'),
667 'url': video_url.decode('utf-8'),
668 'uploader': video_uploader.decode('utf-8'),
670 'title': video_title,
671 'ext': video_extension.decode('utf-8'),
675 class DailymotionIE(InfoExtractor):
676 """Information Extractor for Dailymotion"""
678 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
679 IE_NAME = u'dailymotion'
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)
684 def report_extraction(self, video_id):
685 """Report information extraction."""
686 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
688 def _real_extract(self, url):
689 # Extract id and simplified title from URL
690 mobj = re.match(self._VALID_URL, url)
692 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
695 video_id = mobj.group(1).split('_')[0].split('?')[0]
697 video_extension = 'mp4'
699 # Retrieve video webpage to extract further information
700 request = compat_urllib_request.Request(url)
701 request.add_header('Cookie', 'family_filter=off')
702 webpage = self._download_webpage(request, video_id)
704 # Extract URL, uploader and title from webpage
705 self.report_extraction(video_id)
706 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
708 self._downloader.trouble(u'ERROR: unable to extract media URL')
710 flashvars = compat_urllib_parse.unquote(mobj.group(1))
712 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
715 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
718 self._downloader.trouble(u'ERROR: unable to extract video URL')
721 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
723 self._downloader.trouble(u'ERROR: unable to extract video URL')
726 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
728 # TODO: support choosing qualities
730 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
732 self._downloader.trouble(u'ERROR: unable to extract title')
734 video_title = unescapeHTML(mobj.group('title'))
736 video_uploader = None
737 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
739 # lookin for official user
740 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
741 if mobj_official is None:
742 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
744 video_uploader = mobj_official.group(1)
746 video_uploader = mobj.group(1)
748 video_upload_date = None
749 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
751 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
756 'uploader': video_uploader,
757 'upload_date': video_upload_date,
758 'title': video_title,
759 'ext': video_extension,
763 class PhotobucketIE(InfoExtractor):
764 """Information extractor for photobucket.com."""
766 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
767 IE_NAME = u'photobucket'
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)
772 def report_download_webpage(self, video_id):
773 """Report webpage download."""
774 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
776 def report_extraction(self, video_id):
777 """Report information extraction."""
778 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
780 def _real_extract(self, url):
781 # Extract id from URL
782 mobj = re.match(self._VALID_URL, url)
784 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
787 video_id = mobj.group(1)
789 video_extension = 'flv'
791 # Retrieve video webpage to extract further information
792 request = compat_urllib_request.Request(url)
794 self.report_download_webpage(video_id)
795 webpage = compat_urllib_request.urlopen(request).read()
796 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
797 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
800 # Extract URL, uploader, and title from webpage
801 self.report_extraction(video_id)
802 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
804 self._downloader.trouble(u'ERROR: unable to extract media URL')
806 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
810 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
812 self._downloader.trouble(u'ERROR: unable to extract title')
814 video_title = mobj.group(1).decode('utf-8')
816 video_uploader = mobj.group(2).decode('utf-8')
819 'id': video_id.decode('utf-8'),
820 'url': video_url.decode('utf-8'),
821 'uploader': video_uploader,
823 'title': video_title,
824 'ext': video_extension.decode('utf-8'),
828 class YahooIE(InfoExtractor):
829 """Information extractor for video.yahoo.com."""
832 # _VALID_URL matches all Yahoo! Video URLs
833 # _VPAGE_URL matches only the extractable '/watch/' URLs
834 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
835 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
836 IE_NAME = u'video.yahoo'
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)
841 def report_download_webpage(self, video_id):
842 """Report webpage download."""
843 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
845 def report_extraction(self, video_id):
846 """Report information extraction."""
847 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
849 def _real_extract(self, url, new_video=True):
850 # Extract ID from URL
851 mobj = re.match(self._VALID_URL, url)
853 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
856 video_id = mobj.group(2)
857 video_extension = 'flv'
859 # Rewrite valid but non-extractable URLs as
860 # extractable English language /watch/ URLs
861 if re.match(self._VPAGE_URL, url) is None:
862 request = compat_urllib_request.Request(url)
864 webpage = compat_urllib_request.urlopen(request).read()
865 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
866 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
869 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
871 self._downloader.trouble(u'ERROR: Unable to extract id field')
873 yahoo_id = mobj.group(1)
875 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
877 self._downloader.trouble(u'ERROR: Unable to extract vid field')
879 yahoo_vid = mobj.group(1)
881 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
882 return self._real_extract(url, new_video=False)
884 # Retrieve video webpage to extract further information
885 request = compat_urllib_request.Request(url)
887 self.report_download_webpage(video_id)
888 webpage = compat_urllib_request.urlopen(request).read()
889 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
890 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
893 # Extract uploader and title from webpage
894 self.report_extraction(video_id)
895 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
897 self._downloader.trouble(u'ERROR: unable to extract video title')
899 video_title = mobj.group(1).decode('utf-8')
901 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
903 self._downloader.trouble(u'ERROR: unable to extract video uploader')
905 video_uploader = mobj.group(1).decode('utf-8')
907 # Extract video thumbnail
908 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
910 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
912 video_thumbnail = mobj.group(1).decode('utf-8')
914 # Extract video description
915 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
917 self._downloader.trouble(u'ERROR: unable to extract video description')
919 video_description = mobj.group(1).decode('utf-8')
920 if not video_description:
921 video_description = 'No description available.'
923 # Extract video height and width
924 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
926 self._downloader.trouble(u'ERROR: unable to extract video height')
928 yv_video_height = mobj.group(1)
930 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
932 self._downloader.trouble(u'ERROR: unable to extract video width')
934 yv_video_width = mobj.group(1)
936 # Retrieve video playlist to extract media URL
937 # I'm not completely sure what all these options are, but we
938 # seem to need most of them, otherwise the server sends a 401.
939 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
940 yv_bitrate = '700' # according to Wikipedia this is hard-coded
941 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
942 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
943 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
945 self.report_download_webpage(video_id)
946 webpage = compat_urllib_request.urlopen(request).read()
947 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
948 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
951 # Extract media URL from playlist XML
952 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
954 self._downloader.trouble(u'ERROR: Unable to extract media URL')
956 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
957 video_url = unescapeHTML(video_url)
960 'id': video_id.decode('utf-8'),
962 'uploader': video_uploader,
964 'title': video_title,
965 'ext': video_extension.decode('utf-8'),
966 'thumbnail': video_thumbnail.decode('utf-8'),
967 'description': video_description,
971 class VimeoIE(InfoExtractor):
972 """Information extractor for vimeo.com."""
# NOTE(review): this chunk is a lossy extract -- the embedded original line
# numbers skip values (e.g. 992-995), so interleaved `if mobj is None:`
# guards, `try:` openers and `return` statements are not visible here.
# Recover the full text from version control before behavioural edits.
974 # _VALID_URL matches Vimeo URLs
# NOTE(review): the dot after (?:www|player) is unescaped, so it matches ANY
# single character rather than a literal '.' -- presumably intended as `\.`.
975 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
978 def __init__(self, downloader=None):
979 InfoExtractor.__init__(self, downloader)
981 def report_download_webpage(self, video_id):
982 """Report webpage download."""
983 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
985 def report_extraction(self, video_id):
986 """Report information extraction."""
987 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# Fetches the watch page, parses the embedded `{config: ...}` JSON blob and
# derives url/title/uploader/date plus a codec+quality choice from it.
989 def _real_extract(self, url, new_video=True):
990 # Extract ID from URL
991 mobj = re.match(self._VALID_URL, url)
993 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
996 video_id = mobj.group(1)
998 # Retrieve video webpage to extract further information
999 request = compat_urllib_request.Request(url, None, std_headers)
1001 self.report_download_webpage(video_id)
1002 webpage_bytes = compat_urllib_request.urlopen(request).read()
1003 webpage = webpage_bytes.decode('utf-8')
1004 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1005 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1008 # Now we begin extracting as much information as we can from what we
1009 # retrieved. First we extract the information common to all extractors,
1010 # and latter we extract those that are Vimeo specific.
1011 self.report_extraction(video_id)
1013 # Extract the config JSON
# NOTE(review): brittle -- depends on the literal markers ' = {config:' and
# ',assets:' appearing verbatim in the page source.
1015 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1016 config = json.loads(config)
1018 self._downloader.trouble(u'ERROR: unable to extract info section')
1022 video_title = config["video"]["title"]
1024 # Extract uploader and uploader_id
1025 video_uploader = config["video"]["owner"]["name"]
1026 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1028 # Extract video thumbnail
1029 video_thumbnail = config["video"]["thumbnail"]
1031 # Extract video description
1032 video_description = get_element_by_attribute("itemprop", "description", webpage)
1033 if video_description: video_description = clean_html(video_description)
1034 else: video_description = ''
1036 # Extract upload date
1037 video_upload_date = None
1038 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1039 if mobj is not None:
1040 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1042 # Vimeo specific: extract request signature and timestamp
1043 sig = config['request']['signature']
1044 timestamp = config['request']['timestamp']
1046 # Vimeo specific: extract video codec and quality information
1047 # First consider quality, then codecs, then take everything
1048 # TODO bind to format param
1049 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1050 files = { 'hd': [], 'sd': [], 'other': []}
1051 for codec_name, codec_extension in codecs:
1052 if codec_name in config["video"]["files"]:
1053 if 'hd' in config["video"]["files"][codec_name]:
1054 files['hd'].append((codec_name, codec_extension, 'hd'))
1055 elif 'sd' in config["video"]["files"][codec_name]:
1056 files['sd'].append((codec_name, codec_extension, 'sd'))
1058 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first populated quality bucket in fixed preference order hd>sd>other.
1060 for quality in ('hd', 'sd', 'other'):
1061 if len(files[quality]) > 0:
1062 video_quality = files[quality][0][2]
1063 video_codec = files[quality][0][0]
1064 video_extension = files[quality][0][1]
1065 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1068 self._downloader.trouble(u'ERROR: no known codec found')
1071 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1072 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Returned info dictionary follows (the dict opener around original line
# 1075 is not visible in this extract).
1077 'uploader': video_uploader,
1078 'uploader_id': video_uploader_id,
1079 'upload_date': video_upload_date,
1080 'title': video_title,
1081 'ext': video_extension,
1082 'thumbnail': video_thumbnail,
1083 'description': video_description,
1087 class ArteTvIE(InfoExtractor):
1088 """arte.tv information extractor."""
# NOTE(review): lossy extract -- embedded original line numbers skip values,
# so `try:` openers, `if mobj is None:` guards and `return` statements are
# missing from this view.  Recover full text from VCS before editing.
1090 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1091 _LIVE_URL = r'index-[0-9]+\.html$'
1093 IE_NAME = u'arte.tv'
1095 def __init__(self, downloader=None):
1096 InfoExtractor.__init__(self, downloader)
1098 def report_download_webpage(self, video_id):
1099 """Report webpage download."""
1100 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1102 def report_extraction(self, video_id):
1103 """Report information extraction."""
1104 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
# Downloads `url` and returns the page body; errors are routed through
# self._downloader.trouble (the `try:` line is not visible in this extract).
1106 def fetch_webpage(self, url):
1107 request = compat_urllib_request.Request(url)
1109 self.report_download_webpage(url)
1110 webpage = compat_urllib_request.urlopen(request).read()
1111 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1112 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1114 except ValueError as err:
1115 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# Fetches `url`, applies `regex` with `regexFlags`, and builds a dict from
# `matchTuples` entries (group index, dict key, error message on miss).
1119 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1120 page = self.fetch_webpage(url)
1121 mobj = re.search(regex, page, regexFlags)
1125 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1128 for (i, key, err) in matchTuples:
1129 if mobj.group(i) is None:
1130 self._downloader.trouble(err)
1133 info[key] = mobj.group(i)
# Live-stream path: locate the videothek JS, then the SWF player/stream path.
1137 def extractLiveStream(self, url):
1138 video_lang = url.split('/')[-4]
1139 info = self.grep_webpage(
1141 r'src="(.*?/videothek_js.*?\.js)',
1144 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1147 http_host = url.split('/')[2]
1148 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1149 info = self.grep_webpage(
# NOTE(review): the continuation fragments below are NON-raw strings
# containing `\.` -- an invalid escape sequence in Python 3 (DeprecationWarning,
# later SyntaxWarning/SyntaxError); they should be raw `r'...'` literals.
1151 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1152 '(http://.*?\.swf).*?' +
1156 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1157 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1158 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1161 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# "Plus 7" catch-up path: follow two levels of videoref indirection, then
# read id/name/date/hd-url from the final XML document.
1163 def extractPlus7Stream(self, url):
1164 video_lang = url.split('/')[-3]
1165 info = self.grep_webpage(
1167 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1170 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1173 next_url = compat_urllib_parse.unquote(info.get('url'))
1174 info = self.grep_webpage(
1176 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1179 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1182 next_url = compat_urllib_parse.unquote(info.get('url'))
1184 info = self.grep_webpage(
1186 r'<video id="(.*?)".*?>.*?' +
1187 '<name>(.*?)</name>.*?' +
1188 '<dateVideo>(.*?)</dateVideo>.*?' +
1189 '<url quality="hd">(.*?)</url>',
1192 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1193 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1194 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1195 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# NOTE(review): 'upload_date' below is the raw <dateVideo> text; the class
# docstring contract says YYYYMMDD -- confirm the site format matches.
1200 'id': info.get('id'),
1201 'url': compat_urllib_parse.unquote(info.get('url')),
1202 'uploader': u'arte.tv',
1203 'upload_date': info.get('date'),
1204 'title': info.get('title').decode('utf-8'),
# Dispatch: live URLs (index-NN.html) vs regular "Plus 7" video pages.
1210 def _real_extract(self, url):
1211 video_id = url.split('/')[-1]
1212 self.report_extraction(video_id)
1214 if re.search(self._LIVE_URL, video_id) is not None:
1215 self.extractLiveStream(url)
1218 info = self.extractPlus7Stream(url)
1223 class GenericIE(InfoExtractor):
1224 """Generic last-resort information extractor."""
# NOTE(review): lossy extract -- embedded original line numbers skip values,
# so `if mobj is None:` guards, `try:` openers, `return` statements and some
# method bodies (e.g. get_method's `return "HEAD"`) are not visible here.
1227 IE_NAME = u'generic'
1229 def __init__(self, downloader=None):
1230 InfoExtractor.__init__(self, downloader)
1232 def report_download_webpage(self, video_id):
1233 """Report webpage download."""
1234 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1235 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1237 def report_extraction(self, video_id):
1238 """Report information extraction."""
1239 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1241 def report_following_redirect(self, new_url):
1242 """Report information extraction."""
1243 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
# Issues a HEAD request (falling back to GET on 405) to detect URL-shortener
# redirects; if the final URL differs, re-dispatches it through the downloader.
1245 def _test_redirect(self, url):
1246 """Check if it is a redirect, like url shorteners, in case restart chain."""
1247 class HeadRequest(compat_urllib_request.Request):
1248 def get_method(self):
1251 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1253 Subclass the HTTPRedirectHandler to make it use our
1254 HeadRequest also on the redirected URL
1256 def redirect_request(self, req, fp, code, msg, headers, newurl):
1257 if code in (301, 302, 303, 307):
1258 newurl = newurl.replace(' ', '%20')
# Strip entity headers: the redirected HEAD request carries no body.
1259 newheaders = dict((k,v) for k,v in req.headers.items()
1260 if k.lower() not in ("content-length", "content-type"))
1261 return HeadRequest(newurl,
1263 origin_req_host=req.get_origin_req_host(),
1266 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1268 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1270 Fallback to GET if HEAD is not allowed (405 HTTP error)
1272 def http_error_405(self, req, fp, code, msg, headers):
1276 newheaders = dict((k,v) for k,v in req.headers.items()
1277 if k.lower() not in ("content-length", "content-type"))
1278 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1280 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener by hand so only these handlers are installed.
1284 opener = compat_urllib_request.OpenerDirector()
1285 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1286 HTTPMethodFallback, HEADRedirectHandler,
1287 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1288 opener.add_handler(handler())
1290 response = opener.open(HeadRequest(url))
1291 new_url = response.geturl()
1296 self.report_following_redirect(new_url)
1297 self._downloader.download([new_url])
# Last-resort extraction: scan the raw page for a JW-Player/SWFObject style
# file= or source= URL, take the title from <title> and uploader from the host.
1300 def _real_extract(self, url):
1301 if self._test_redirect(url): return
1303 video_id = url.split('/')[-1]
1304 request = compat_urllib_request.Request(url)
1306 self.report_download_webpage(video_id)
1307 webpage = compat_urllib_request.urlopen(request).read()
1308 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1309 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1311 except ValueError as err:
1312 # since this is the last-resort InfoExtractor, if
1313 # this error is thrown, it'll be thrown here
1314 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1317 self.report_extraction(video_id)
1318 # Start with something easy: JW Player in SWFObject
1319 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1321 # Broaden the search a little bit
1322 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1324 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1327 # It's possible that one of the regexes
1328 # matched, but returned an empty group:
1329 if mobj.group(1) is None:
1330 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1333 video_url = compat_urllib_parse.unquote(mobj.group(1))
1334 video_id = os.path.basename(video_url)
1336 # here's a fun little line of code for you:
1337 video_extension = os.path.splitext(video_id)[1][1:]
1338 video_id = os.path.splitext(video_id)[0]
1340 # it's tempting to parse this further, but you would
1341 # have to take into account all the variations like
1342 # Video Title - Site Name
1343 # Site Name | Video Title
1344 # Video Title - Tagline | Site Name
1345 # and so on and so forth; it's just not practical
1346 mobj = re.search(r'<title>(.*)</title>', webpage)
1348 self._downloader.trouble(u'ERROR: unable to extract title')
1350 video_title = mobj.group(1)
1352 # video uploader is domain name
1353 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): copy-paste bug -- this branch fails to extract the UPLOADER
# (domain name) but reports 'unable to extract title'.
1355 self._downloader.trouble(u'ERROR: unable to extract title')
1357 video_uploader = mobj.group(1)
1362 'uploader': video_uploader,
1363 'upload_date': None,
1364 'title': video_title,
1365 'ext': video_extension,
1369 class YoutubeSearchIE(InfoExtractor):
1370 """Information Extractor for YouTube search queries."""
# NOTE(review): lossy extract -- embedded original line numbers skip values;
# `if mobj is None:`, `try:`, `return` and branch lines are missing here.
1371 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1372 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1373 _max_youtube_results = 1000
1374 IE_NAME = u'youtube:search'
1376 def __init__(self, downloader=None):
1377 InfoExtractor.__init__(self, downloader)
1379 def report_download_page(self, query, pagenum):
1380 """Report attempt to download search page with given number."""
# NOTE(review): `query.decode(...)` assumes a bytes query; on Python 3 a str
# has no .decode -- confirm which type callers pass in this codebase.
1381 query = query.decode(preferredencoding())
1382 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parses "ytsearch[N|all]:terms" and delegates to _download_n_results.
1384 def _real_extract(self, query):
1385 mobj = re.match(self._VALID_URL, query)
1387 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# NOTE(review): split(':') with no maxsplit raises ValueError when the search
# terms themselves contain ':'; split(':', 1) would be safe.
1390 prefix, query = query.split(':')
1392 query = query.encode('utf-8')
1394 self._download_n_results(query, 1)
1396 elif prefix == 'all':
1397 self._download_n_results(query, self._max_youtube_results)
1403 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1405 elif n > self._max_youtube_results:
1406 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1407 n = self._max_youtube_results
1408 self._download_n_results(query, n)
1410 except ValueError: # parsing prefix as integer fails
1411 self._download_n_results(query, 1)
# Pages through the GData API (50 results per page) until n ids are collected.
1414 def _download_n_results(self, query, n):
1415 """Downloads a specified number of results for a query"""
1421 while (50 * pagenum) < limit:
1422 self.report_download_page(query, pagenum+1)
1423 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1424 request = compat_urllib_request.Request(result_url)
1426 data = compat_urllib_request.urlopen(request).read()
1427 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1428 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1430 api_response = json.loads(data)['data']
1432 new_ids = list(video['id'] for video in api_response['items'])
1433 video_ids += new_ids
# Cap the loop limit by what the API says actually exists.
1435 limit = min(n, api_response['totalItems'])
1438 if len(video_ids) > n:
1439 video_ids = video_ids[:n]
1440 for id in video_ids:
1441 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1445 class GoogleSearchIE(InfoExtractor):
1446 """Information Extractor for Google Video search queries."""
# NOTE(review): lossy extract -- embedded original line numbers skip values;
# guards, `try:` openers, `return`s and loop initializers are missing here.
1447 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1448 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1449 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1450 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1451 _max_google_results = 1000
1452 IE_NAME = u'video.google:search'
1454 def __init__(self, downloader=None):
1455 InfoExtractor.__init__(self, downloader)
1457 def report_download_page(self, query, pagenum):
1458 """Report attempt to download playlist page with given number."""
1459 query = query.decode(preferredencoding())
1460 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Parses "gvsearch[N|all]:terms" -- same prefix convention as ytsearch.
1462 def _real_extract(self, query):
1463 mobj = re.match(self._VALID_URL, query)
1465 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# NOTE(review): split(':') without maxsplit raises ValueError if the search
# terms contain ':'; split(':', 1) would be safe.
1468 prefix, query = query.split(':')
1470 query = query.encode('utf-8')
1472 self._download_n_results(query, 1)
1474 elif prefix == 'all':
1475 self._download_n_results(query, self._max_google_results)
1481 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1483 elif n > self._max_google_results:
1484 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1485 n = self._max_google_results
1486 self._download_n_results(query, n)
1488 except ValueError: # parsing prefix as integer fails
1489 self._download_n_results(query, 1)
# Scrapes result pages 10 at a time until n ids are found or no next page.
1492 def _download_n_results(self, query, n):
1493 """Downloads a specified number of results for a query"""
1499 self.report_download_page(query, pagenum)
1500 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1501 request = compat_urllib_request.Request(result_url)
1503 page = compat_urllib_request.urlopen(request).read()
1504 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1505 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1508 # Extract video identifiers
1509 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1510 video_id = mobj.group(1)
1511 if video_id not in video_ids:
1512 video_ids.append(video_id)
1513 if len(video_ids) == n:
1514 # Specified n videos reached
1515 for id in video_ids:
1516 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next" link on the page means we have exhausted the results.
1519 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1520 for id in video_ids:
1521 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1524 pagenum = pagenum + 1
1527 class YahooSearchIE(InfoExtractor):
1528 """Information Extractor for Yahoo! Video search queries."""
# NOTE(review): lossy extract -- embedded original line numbers skip values;
# guards, `try:` openers, `return`s and loop initializers are missing here.
1531 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1532 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1533 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1534 _MORE_PAGES_INDICATOR = r'\s*Next'
1535 _max_yahoo_results = 1000
1536 IE_NAME = u'video.yahoo:search'
1538 def __init__(self, downloader=None):
1539 InfoExtractor.__init__(self, downloader)
1541 def report_download_page(self, query, pagenum):
1542 """Report attempt to download playlist page with given number."""
1543 query = query.decode(preferredencoding())
1544 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Parses "yvsearch[N|all]:terms" -- same prefix convention as ytsearch.
1546 def _real_extract(self, query):
1547 mobj = re.match(self._VALID_URL, query)
1549 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# NOTE(review): split(':') without maxsplit raises ValueError if the search
# terms contain ':'; split(':', 1) would be safe.
1552 prefix, query = query.split(':')
1554 query = query.encode('utf-8')
1556 self._download_n_results(query, 1)
1558 elif prefix == 'all':
1559 self._download_n_results(query, self._max_yahoo_results)
1565 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1567 elif n > self._max_yahoo_results:
1568 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1569 n = self._max_yahoo_results
1570 self._download_n_results(query, n)
1572 except ValueError: # parsing prefix as integer fails
1573 self._download_n_results(query, 1)
# Scrapes search result pages, de-duplicating via `already_seen`.
1576 def _download_n_results(self, query, n):
1577 """Downloads a specified number of results for a query"""
1580 already_seen = set()
1584 self.report_download_page(query, pagenum)
1585 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1586 request = compat_urllib_request.Request(result_url)
1588 page = compat_urllib_request.urlopen(request).read()
1589 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1590 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1593 # Extract video identifiers
1594 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1595 video_id = mobj.group(1)
1596 if video_id not in already_seen:
1597 video_ids.append(video_id)
1598 already_seen.add(video_id)
1599 if len(video_ids) == n:
1600 # Specified n videos reached
1601 for id in video_ids:
1602 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "Next" link on the page means the results are exhausted.
1605 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1606 for id in video_ids:
1607 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1610 pagenum = pagenum + 1
1613 class YoutubePlaylistIE(InfoExtractor):
1614 """Information Extractor for YouTube playlists."""
# NOTE(review): lossy extract -- embedded original line numbers skip values;
# `if mobj is None:` guards, `try:`/`else:` lines, loop initializers and
# `break`/`return` statements are missing from this view.
1616 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1617 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1618 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1619 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1620 IE_NAME = u'youtube:playlist'
1622 def __init__(self, downloader=None):
1623 InfoExtractor.__init__(self, downloader)
1625 def report_download_page(self, playlist_id, pagenum):
1626 """Report attempt to download playlist page with given number."""
1627 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
# Pages through the playlist HTML, collects video ids, applies the
# playliststart/playlistend window, then queues each watch URL.
1629 def _real_extract(self, url):
1630 # Extract playlist id
1631 mobj = re.match(self._VALID_URL, url)
1633 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Group 3 captures a single-video component of the URL: dispatch it directly
# instead of walking the whole playlist.
1637 if mobj.group(3) is not None:
1638 self._downloader.download([mobj.group(3)])
1641 # Download playlist pages
1642 # prefix is 'p' as default for playlists but there are other types that need extra care
1643 playlist_prefix = mobj.group(1)
1644 if playlist_prefix == 'a':
1645 playlist_access = 'artist'
1647 playlist_prefix = 'p'
1648 playlist_access = 'view_play_list'
1649 playlist_id = mobj.group(2)
1654 self.report_download_page(playlist_id, pagenum)
1655 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1656 request = compat_urllib_request.Request(url)
1658 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1659 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1660 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1663 # Extract video identifiers
1665 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1666 if mobj.group(1) not in ids_in_page:
1667 ids_in_page.append(mobj.group(1))
1668 video_ids.extend(ids_in_page)
1670 if self._MORE_PAGES_INDICATOR not in page:
1672 pagenum = pagenum + 1
1674 total = len(video_ids)
# --playlist-start is 1-based on the CLI; convert to a 0-based slice index.
1676 playliststart = self._downloader.params.get('playliststart', 1) - 1
1677 playlistend = self._downloader.params.get('playlistend', -1)
1678 if playlistend == -1:
1679 video_ids = video_ids[playliststart:]
1681 video_ids = video_ids[playliststart:playlistend]
1683 if len(video_ids) == total:
1684 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1686 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1688 for id in video_ids:
1689 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1693 class YoutubeChannelIE(InfoExtractor):
1694 """Information Extractor for YouTube channels."""
# NOTE(review): lossy extract -- embedded original line numbers skip values;
# the guard after re.match, the `try:` opener, loop setup and `break` are
# missing from this view.
1696 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1697 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1698 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1699 IE_NAME = u'youtube:channel'
1701 def report_download_page(self, channel_id, pagenum):
1702 """Report attempt to download channel page with given number."""
1703 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
# Pages through the channel video list and queues every found watch URL.
1705 def _real_extract(self, url):
1706 # Extract channel id
1707 mobj = re.match(self._VALID_URL, url)
1709 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1712 # Download channel pages
1713 channel_id = mobj.group(1)
1718 self.report_download_page(channel_id, pagenum)
1719 url = self._TEMPLATE_URL % (channel_id, pagenum)
1720 request = compat_urllib_request.Request(url)
1722 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1723 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1724 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1727 # Extract video identifiers
1729 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1730 if mobj.group(1) not in ids_in_page:
1731 ids_in_page.append(mobj.group(1))
1732 video_ids.extend(ids_in_page)
# Stop when the "Next »" marker disappears from the page.
1734 if self._MORE_PAGES_INDICATOR not in page:
1736 pagenum = pagenum + 1
1738 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1740 for id in video_ids:
1741 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1745 class YoutubeUserIE(InfoExtractor):
1746 """Information Extractor for YouTube users."""
# NOTE(review): lossy extract -- embedded original line numbers skip values;
# guards, `try:` openers, loop setup and `break` lines are missing here.
1748 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1749 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1750 _GDATA_PAGE_SIZE = 50
1751 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1752 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1753 IE_NAME = u'youtube:user'
1755 def __init__(self, downloader=None):
1756 InfoExtractor.__init__(self, downloader)
1758 def report_download_page(self, username, start_index):
1759 """Report attempt to download user page."""
1760 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1761 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
# Pages through the GData uploads feed (50 ids per request), applies the
# playliststart/playlistend window, then queues each watch URL.
1763 def _real_extract(self, url):
1765 mobj = re.match(self._VALID_URL, url)
1767 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1770 username = mobj.group(1)
1772 # Download video ids using YouTube Data API. Result size per
1773 # query is limited (currently to 50 videos) so we need to query
1774 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
1781 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1782 self.report_download_page(username, start_index)
1784 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1787 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1788 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1789 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1792 # Extract video identifiers
1795 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1796 if mobj.group(1) not in ids_in_page:
1797 ids_in_page.append(mobj.group(1))
1799 video_ids.extend(ids_in_page)
1801 # A little optimization - if current page is not
1802 # "full", ie. does not contain PAGE_SIZE video ids then
1803 # we can assume that this page is the last one - there
1804 # are no more ids on further pages - no need to query
1807 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1812 all_ids_count = len(video_ids)
# --playlist-start is 1-based on the CLI; convert to a 0-based slice index.
1813 playliststart = self._downloader.params.get('playliststart', 1) - 1
1814 playlistend = self._downloader.params.get('playlistend', -1)
1816 if playlistend == -1:
1817 video_ids = video_ids[playliststart:]
1819 video_ids = video_ids[playliststart:playlistend]
1821 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1822 (username, all_ids_count, len(video_ids)))
1824 for video_id in video_ids:
1825 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1828 class BlipTVUserIE(InfoExtractor):
1829 """Information Extractor for blip.tv users."""
# NOTE(review): lossy extract -- embedded original line numbers skip values;
# guards, `try:` openers, loop setup and `break` lines are missing here.
# Also `self._PAGE_SIZE` (used below) is not visibly defined in this view --
# confirm the class attribute exists in the full source.
1831 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1833 IE_NAME = u'blip.tv:user'
1835 def __init__(self, downloader=None):
1836 InfoExtractor.__init__(self, downloader)
1838 def report_download_page(self, username, pagenum):
1839 """Report attempt to download user page."""
1840 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1841 (self.IE_NAME, username, pagenum))
# Resolves the numeric users_id from the profile page, then pages through the
# mobile AJAX episode list, windows the ids, and queues each blip.tv URL.
1843 def _real_extract(self, url):
1845 mobj = re.match(self._VALID_URL, url)
1847 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1850 username = mobj.group(1)
1852 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1854 request = compat_urllib_request.Request(url)
1857 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1858 mobj = re.search(r'data-users-id="([^"]+)"', page)
1859 page_base = page_base % mobj.group(1)
1860 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1861 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1865 # Download video ids using BlipTV Ajax calls. Result size per
1866 # query is limited (currently to 12 videos) so we need to query
1867 # page by page until there are no video ids - it means we got
1874 self.report_download_page(username, pagenum)
1876 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1879 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
# NOTE(review): uses str(err) here but compat_str(err) everywhere else in
# this file -- inconsistent error formatting.
1880 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1881 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1884 # Extract video identifiers
# NOTE(review): membership test uses the RAW match but the list stores the
# unescapeHTML()'d value -- ids whose escaped/unescaped forms differ can be
# appended twice.
1887 for mobj in re.finditer(r'href="/([^"]+)"', page):
1888 if mobj.group(1) not in ids_in_page:
1889 ids_in_page.append(unescapeHTML(mobj.group(1)))
1891 video_ids.extend(ids_in_page)
1893 # A little optimization - if current page is not
1894 # "full", ie. does not contain PAGE_SIZE video ids then
1895 # we can assume that this page is the last one - there
1896 # are no more ids on further pages - no need to query
1899 if len(ids_in_page) < self._PAGE_SIZE:
1904 all_ids_count = len(video_ids)
# --playlist-start is 1-based on the CLI; convert to a 0-based slice index.
1905 playliststart = self._downloader.params.get('playliststart', 1) - 1
1906 playlistend = self._downloader.params.get('playlistend', -1)
1908 if playlistend == -1:
1909 video_ids = video_ids[playliststart:]
1911 video_ids = video_ids[playliststart:playlistend]
1913 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1914 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1916 for video_id in video_ids:
1917 self._downloader.download([u'http://blip.tv/'+video_id])
1920 class DepositFilesIE(InfoExtractor):
1921 """Information extractor for depositfiles.com"""
# NOTE(review): lossy extract -- embedded original line numbers skip values;
# the `try:` opener and `return` statements are missing from this view.
1923 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1925 def report_download_webpage(self, file_id):
1926 """Report webpage download."""
1927 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1929 def report_extraction(self, file_id):
1930 """Report information extraction."""
1931 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
# POSTs the 'Free download' form, scrapes the real fileshare URL and the
# <b title="..."> file title from the resulting page.
1933 def _real_extract(self, url):
1934 file_id = url.split('/')[-1]
1935 # Rebuild url in english locale
1936 url = 'http://depositfiles.com/en/files/' + file_id
1938 # Retrieve file webpage with 'Free download' button pressed
1939 free_download_indication = { 'gateway_result' : '1' }
1940 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1942 self.report_download_webpage(file_id)
1943 webpage = compat_urllib_request.urlopen(request).read()
1944 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1945 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1948 # Search for the real file URL
1949 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1950 if (mobj is None) or (mobj.group(1) is None):
1951 # Try to figure out reason of the error.
1952 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1953 if (mobj is not None) and (mobj.group(1) is not None):
# NOTE(review): '\s+' is a non-raw string -- an invalid escape sequence on
# modern Python; should be the raw literal r'\s+'.
1954 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1955 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1957 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1960 file_url = mobj.group(1)
1961 file_extension = os.path.splitext(file_url)[1][1:]
1963 # Search for file title
1964 mobj = re.search(r'<b title="(.*?)">', webpage)
1966 self._downloader.trouble(u'ERROR: unable to extract title')
# NOTE(review): the .decode('utf-8') calls below assume byte strings
# (Python 2); under Python 3 `str` has no .decode and these raise
# AttributeError -- confirm against the compat layer before porting.
1968 file_title = mobj.group(1).decode('utf-8')
1971 'id': file_id.decode('utf-8'),
1972 'url': file_url.decode('utf-8'),
1974 'upload_date': None,
1975 'title': file_title,
1976 'ext': file_extension.decode('utf-8'),
1980 class FacebookIE(InfoExtractor):
1981 """Information Extractor for Facebook"""
# NOTE(review): sampled listing — lines are missing between the numbered
# gaps (try:/return/else branches and the login_form construction).
1983 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1984 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1985 _NETRC_MACHINE = 'facebook'
1986 IE_NAME = u'facebook'
1988 def report_login(self):
1989 """Report attempt to log in."""
1990 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
1992 def _real_initialize(self):
# Optional login step: credentials come from --username/--password or,
# failing that, from the 'facebook' machine entry in ~/.netrc.
1993 if self._downloader is None:
1998 downloader_params = self._downloader.params
2000 # Attempt to use provided username and password or .netrc data
2001 if downloader_params.get('username', None) is not None:
2002 useremail = downloader_params['username']
2003 password = downloader_params['password']
2004 elif downloader_params.get('usenetrc', False):
2006 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2007 if info is not None:
2011 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2012 except (IOError, netrc.NetrcParseError) as err:
2013 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2016 if useremail is None:
2025 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2028 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed;
# failure is only a warning, extraction proceeds unauthenticated.
2029 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2030 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2032 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2033 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2036 def _real_extract(self, url):
2037 mobj = re.match(self._VALID_URL, url)
2039 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2041 video_id = mobj.group('ID')
2043 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2044 webpage = self._download_webpage(url, video_id)
# The flashvars JSON array sits between these two literal JS fragments;
# re.escape lets them be used verbatim as regex anchors.
2046 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2047 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2048 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2050 raise ExtractorError(u'Cannot parse data')
2051 data = dict(json.loads(m.group(1)))
2052 video_url = compat_urllib_parse.unquote(data['hd_src'])
2053 video_duration = int(data['video_duration'])
2055 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2057 raise ExtractorError(u'Cannot find title in webpage')
2058 video_title = unescapeHTML(m.group(1))
2062 'title': video_title,
2065 'duration': video_duration,
2066 'thumbnail': data['thumbnail_src'],
2071 class BlipTVIE(InfoExtractor):
2072 """Information extractor for blip.tv"""
# NOTE(review): sampled listing — the try:/return scaffolding around the
# except clauses below is missing from this view.
2074 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2075 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2076 IE_NAME = u'blip.tv'
2078 def report_extraction(self, file_id):
2079 """Report information extraction."""
2080 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2082 def report_direct_download(self, title):
2083 """Report information extraction."""
2084 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2086 def _real_extract(self, url):
2087 mobj = re.match(self._VALID_URL, url)
2089 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv's JSON skin for metadata; the iTunes User-Agent is required
# for this endpoint to respond usefully.
2096 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2097 request = compat_urllib_request.Request(json_url)
2098 request.add_header('User-Agent', 'iTunes/10.6.1')
2099 self.report_extraction(mobj.group(1))
2102 urlh = compat_urllib_request.urlopen(request)
2103 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
# A video/* Content-Type means the URL is the media itself; derive title
# and extension from the URL basename instead of parsing JSON.
2104 basename = url.split('/')[-1]
2105 title,ext = os.path.splitext(basename)
2106 title = title.decode('UTF-8')
2107 ext = ext.replace('.', '')
2108 self.report_direct_download(title)
2113 'upload_date': None,
2118 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2119 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2120 if info is None: # Regular URL
2122 json_code_bytes = urlh.read()
2123 json_code = json_code_bytes.decode('utf-8')
2124 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2125 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2129 json_data = json.loads(json_code)
2130 if 'Post' in json_data:
2131 data = json_data['Post']
# blip.tv datestamps look like '10-31-12 08:30PM'; normalized to YYYYMMDD.
2135 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2136 video_url = data['media']['url']
2137 umobj = re.match(self._URL_EXT, video_url)
2139 raise ValueError('Can not determine filename extension')
2140 ext = umobj.group(1)
2143 'id': data['item_id'],
2145 'uploader': data['display_name'],
2146 'upload_date': upload_date,
2147 'title': data['title'],
2149 'format': data['media']['mimeType'],
2150 'thumbnail': data['thumbnailUrl'],
2151 'description': data['description'],
2152 'player_url': data['embedUrl'],
2153 'user_agent': 'iTunes/10.6.1',
2155 except (ValueError,KeyError) as err:
2156 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL and title from a myvideo.de watch page.

        Returns a single-element list with the info dictionary, or None
        (after reporting trouble) when the URL or page cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was 'self._download.trouble(...)' — the attribute set by
            # set_downloader() is self._downloader; the typo raised
            # AttributeError on every invalid URL instead of reporting it.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The image_src <link> carries the media base URL; the flv itself
        # lives at <base>/<video_id>.flv
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                         webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
2211 class ComedyCentralIE(InfoExtractor):
2212 """Information extractor for The Daily Show and Colbert Report """
# NOTE(review): sampled listing — numbered gaps mean lines (dict bodies,
# try:/return statements, loop bodies) are missing from this view.
2214 # urls can be abbreviations like :thedailyshow or :colbert
2215 # urls for episodes like:
2216 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2217 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2218 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2219 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2220 |(https?://)?(www\.)?
2221 (?P<showname>thedailyshow|colbertnation)\.com/
2222 (full-episodes/(?P<episode>.*)|
2224 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2225 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2228 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2230 _video_extensions = {
2238 _video_dimensions = {
2247 def suitable(self, url):
2248 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose-mode pattern; the base-class
# suitable() (see HEAD) matches without re.VERBOSE.
2249 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2251 def report_extraction(self, episode_id):
2252 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2254 def report_config_download(self, episode_id, media_id):
2255 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2257 def report_index_download(self, episode_id):
2258 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2260 def _print_formats(self, formats):
2261 print('Available formats:')
2263 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2266 def _real_extract(self, url):
2267 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2269 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# ':tds' / ':colbert' style shortcuts expand to the show's full-episodes
# page and are re-matched so the named groups are populated.
2272 if mobj.group('shortname'):
2273 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2274 url = u'http://www.thedailyshow.com/full-episodes/'
2276 url = u'http://www.colbertnation.com/full-episodes/'
2277 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2278 assert mobj is not None
2280 if mobj.group('clip'):
2281 if mobj.group('showname') == 'thedailyshow':
2282 epTitle = mobj.group('tdstitle')
2284 epTitle = mobj.group('cntitle')
2287 dlNewest = not mobj.group('episode')
2289 epTitle = mobj.group('showname')
2291 epTitle = mobj.group('episode')
2293 req = compat_urllib_request.Request(url)
2294 self.report_extraction(epTitle)
2296 htmlHandle = compat_urllib_request.urlopen(req)
2297 html = htmlHandle.read()
2298 webpage = html.decode('utf-8')
2299 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2300 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The site may redirect (e.g. to the newest episode); re-validate the
# final URL so the episode group reflects what was actually fetched.
2303 url = htmlHandle.geturl()
2304 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2306 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2308 if mobj.group('episode') == '':
2309 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2311 epTitle = mobj.group('episode')
2313 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2315 if len(mMovieParams) == 0:
2316 # The Colbert Report embeds the information in a without
2317 # a URL prefix; so extract the alternate reference
2318 # and then add the URL prefix manually.
2320 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2321 if len(altMovieParams) == 0:
2322 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2325 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2327 uri = mMovieParams[0][1]
2328 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2329 self.report_index_download(epTitle)
2331 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2332 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2333 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
# Each <item> in the MRSS index is one part of the episode; a per-part
# config XML is then fetched to enumerate the available renditions.
2338 idoc = xml.etree.ElementTree.fromstring(indexXml)
2339 itemEls = idoc.findall('.//item')
2340 for partNum,itemEl in enumerate(itemEls):
2341 mediaId = itemEl.findall('./guid')[0].text
2342 shortMediaId = mediaId.split(':')[-1]
2343 showId = mediaId.split(':')[-2].replace('.com', '')
2344 officialTitle = itemEl.findall('./title')[0].text
2345 officialDate = itemEl.findall('./pubDate')[0].text
2347 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2348 compat_urllib_parse.urlencode({'uri': mediaId}))
2349 configReq = compat_urllib_request.Request(configUrl)
2350 self.report_config_download(epTitle, shortMediaId)
2352 configXml = compat_urllib_request.urlopen(configReq).read()
2353 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2354 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2357 cdoc = xml.etree.ElementTree.fromstring(configXml)
2359 for rendition in cdoc.findall('.//rendition'):
2360 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2364 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2367 if self._downloader.params.get('listformats', None):
2368 self._print_formats([i[0] for i in turls])
2371 # For now, just pick the highest bitrate
2372 format,rtmp_video_url = turls[-1]
2374 # Get the format arg from the arg stream
2375 req_format = self._downloader.params.get('format', None)
2377 # Select format if we can find one
2380 format, rtmp_video_url = f, v
# The rtmp URL is rewritten into a plain HTTP mp4 URL on the llnwd CDN;
# only the 'gsp.comedystor/...' path suffix is reused.
2383 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2385 raise ExtractorError(u'Cannot transform RTMP url')
2386 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2387 video_url = base + m.group('finalid')
2389 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2394 'upload_date': officialDate,
2399 'description': officialTitle,
2401 results.append(info)
2406 class EscapistIE(InfoExtractor):
2407 """Information extractor for The Escapist """
# NOTE(review): sampled listing — try:/return lines around the except
# clauses below are missing from this view.
2409 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2410 IE_NAME = u'escapist'
2412 def report_extraction(self, showName):
2413 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2415 def report_config_download(self, showName):
2416 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2418 def _real_extract(self, url):
2419 mobj = re.match(self._VALID_URL, url)
2421 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2423 showName = mobj.group('showname')
2424 videoId = mobj.group('episode')
2426 self.report_extraction(showName)
2428 webPage = compat_urllib_request.urlopen(url)
2429 webPageBytes = webPage.read()
# Decode using the charset announced in the Content-Type header,
# defaulting to utf-8 when none is present.
2430 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2431 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2432 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2433 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# NOTE(review): these .group(1) calls assume every <meta> tag is present;
# a missing tag would raise AttributeError on None rather than report a
# clean extraction error.
2436 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2437 description = unescapeHTML(descMatch.group(1))
2438 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2439 imgUrl = unescapeHTML(imgMatch.group(1))
2440 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2441 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2442 configUrlMatch = re.search('config=(.*)$', playerUrl)
2443 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2445 self.report_config_download(showName)
2447 configJSON = compat_urllib_request.urlopen(configUrl)
2448 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2449 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2450 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2451 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2454 # Technically, it's JavaScript, not JSON
2455 configJSON = configJSON.replace("'", '"')
2458 config = json.loads(configJSON)
2459 except (ValueError,) as err:
2460 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2463 playlist = config['playlist']
2464 videoUrl = playlist[1]['url']
2469 'uploader': showName,
2470 'upload_date': None,
2473 'thumbnail': imgUrl,
2474 'description': description,
2475 'player_url': playerUrl,
2480 class CollegeHumorIE(InfoExtractor):
2481 """Information extractor for collegehumor.com"""
# NOTE(review): sampled listing — try:/return lines and parts of the
# info dict literal are missing from this view.
2484 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2485 IE_NAME = u'collegehumor'
2487 def report_manifest(self, video_id):
2488 """Report information extraction."""
2489 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2491 def report_extraction(self, video_id):
2492 """Report information extraction."""
2493 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2495 def _real_extract(self, url):
2496 mobj = re.match(self._VALID_URL, url)
2498 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2500 video_id = mobj.group('videoid')
2505 'upload_date': None,
# Step 1: the moogaloop metadata XML yields title/description/thumbnail
# and the URL of an f4m manifest.
2508 self.report_extraction(video_id)
2509 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2511 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2512 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2513 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2516 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2518 videoNode = mdoc.findall('./video')[0]
2519 info['description'] = videoNode.findall('./description')[0].text
2520 info['title'] = videoNode.findall('./caption')[0].text
2521 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2522 manifest_url = videoNode.findall('./file')[0].text
2524 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Step 2: the f4m (Adobe HDS) manifest provides the media node used to
# assemble the final segment URL below.
2527 manifest_url += '?hdcore=2.10.3'
2528 self.report_manifest(video_id)
2530 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2531 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2532 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2535 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2537 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2538 node_id = media_node.attrib['url']
2539 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2540 except IndexError as err:
2541 self._downloader.trouble(u'\nERROR: Invalid manifest file')
2544 url_pr = compat_urllib_parse_urlparse(manifest_url)
2545 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2552 class XVideosIE(InfoExtractor):
2553 """Information extractor for xvideos.com"""
# NOTE(review): sampled listing — the 'if mobj is None:'/'return' guard
# lines between the searches and trouble() calls are missing here.
2555 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2556 IE_NAME = u'xvideos'
2558 def report_extraction(self, video_id):
2559 """Report information extraction."""
2560 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2562 def _real_extract(self, url):
2563 mobj = re.match(self._VALID_URL, url)
2565 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2567 video_id = mobj.group(1)
2569 webpage = self._download_webpage(url, video_id)
2571 self.report_extraction(video_id)
# The flv URL is URL-encoded inside the page's flashvars.
2575 mobj = re.search(r'flv_url=(.+?)&', webpage)
2577 self._downloader.trouble(u'ERROR: unable to extract video url')
2579 video_url = compat_urllib_parse.unquote(mobj.group(1))
2583 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2585 self._downloader.trouble(u'ERROR: unable to extract video title')
2587 video_title = mobj.group(1)
2590 # Extract video thumbnail
# group(0) is deliberate: the whole matched URL (ending in the jpg name
# captured by group 1) is the thumbnail address.
2591 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2593 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2595 video_thumbnail = mobj.group(0)
2601 'upload_date': None,
2602 'title': video_title,
2604 'thumbnail': video_thumbnail,
2605 'description': None,
2611 class SoundcloudIE(InfoExtractor):
2612 """Information extractor for soundcloud.com
2613 To access the media, the uid of the song and a stream token
2614 must be extracted from the page source and the script must make
2615 a request to media.soundcloud.com/crossdomain.xml. Then
2616 the media can be grabbed by requesting from an url composed
2617 of the stream token and uid
# NOTE(review): sampled listing — try:/return lines around the except
# clauses below are missing from this view.
2620 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2621 IE_NAME = u'soundcloud'
2623 def __init__(self, downloader=None):
2624 InfoExtractor.__init__(self, downloader)
2626 def report_resolve(self, video_id):
2627 """Report information extraction."""
2628 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2630 def report_extraction(self, video_id):
2631 """Report information extraction."""
2632 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2634 def _real_extract(self, url):
2635 mobj = re.match(self._VALID_URL, url)
2637 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2640 # extract uploader (which is in the url)
2641 uploader = mobj.group(1)
2642 # extract simple title (uploader + slug of song title)
2643 slug_title = mobj.group(2)
2644 simple_title = uploader + u'-' + slug_title
2646 self.report_resolve('%s/%s' % (uploader, slug_title))
# Step 1: resolve the page URL to a track id via the public API.
# NOTE(review): client_id is a hard-coded API key baked into this source.
2648 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2649 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2650 request = compat_urllib_request.Request(resolv_url)
2652 info_json_bytes = compat_urllib_request.urlopen(request).read()
2653 info_json = info_json_bytes.decode('utf-8')
2654 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2655 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2658 info = json.loads(info_json)
2659 video_id = info['id']
2660 self.report_extraction('%s/%s' % (uploader, slug_title))
# Step 2: the streams endpoint yields the direct mp3 URL for the track.
2662 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2663 request = compat_urllib_request.Request(streams_url)
2665 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2666 stream_json = stream_json_bytes.decode('utf-8')
2667 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2668 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2671 streams = json.loads(stream_json)
2672 mediaURL = streams['http_mp3_128_url']
2677 'uploader': info['user']['username'],
2678 'upload_date': info['created_at'],
2679 'title': info['title'],
2681 'description': info['description'],
2685 class InfoQIE(InfoExtractor):
2686 """Information extractor for infoq.com"""
# NOTE(review): sampled listing — guard/return lines are missing between
# the searches and the trouble() calls below.
2687 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2689 def report_extraction(self, video_id):
2690 """Report information extraction."""
2691 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2693 def _real_extract(self, url):
2694 mobj = re.match(self._VALID_URL, url)
2696 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2699 webpage = self._download_webpage(url, video_id=url)
2700 self.report_extraction(url)
# The real media id is base64-encoded in the page's jsclassref attribute;
# decoding and unquoting it yields the rtmpe path suffix.
2703 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2705 self._downloader.trouble(u'ERROR: unable to extract video url')
2707 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2708 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2711 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2713 self._downloader.trouble(u'ERROR: unable to extract video title')
2715 video_title = mobj.group(1)
2717 # Extract description
2718 video_description = u'No description available.'
2719 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2720 if mobj is not None:
2721 video_description = mobj.group(1)
# The id/extension are derived from the media filename embedded in the URL.
2723 video_filename = video_url.split('/')[-1]
2724 video_id, extension = video_filename.split('.')
2730 'upload_date': None,
2731 'title': video_title,
2732 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2734 'description': video_description,
2739 class MixcloudIE(InfoExtractor):
2740 """Information extractor for www.mixcloud.com"""
# Marked broken (_WORKING = False): the site moved to a new API; kept for
# reference. NOTE(review): sampled listing — try:/return/loop lines are
# missing between the numbered gaps below.
2742 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2743 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2744 IE_NAME = u'mixcloud'
2746 def __init__(self, downloader=None):
2747 InfoExtractor.__init__(self, downloader)
2749 def report_download_json(self, file_id):
2750 """Report JSON download."""
2751 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2753 def report_extraction(self, file_id):
2754 """Report information extraction."""
2755 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2757 def get_urls(self, jsonData, fmt, bitrate='best'):
2758 """Get urls from 'audio_formats' section in json"""
2761 bitrate_list = jsonData[fmt]
2762 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2763 bitrate = max(bitrate_list) # select highest
2765 url_list = jsonData[fmt][bitrate]
2766 except TypeError: # we have no bitrate info.
2767 url_list = jsonData[fmt]
2770 def check_urls(self, url_list):
2771 """Returns 1st active url from list"""
# Probes each candidate with a real request; network errors fall through
# to the next candidate.
2772 for url in url_list:
2774 compat_urllib_request.urlopen(url)
2776 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2781 def _print_formats(self, formats):
2782 print('Available formats:')
2783 for fmt in formats.keys():
2784 for b in formats[fmt]:
2786 ext = formats[fmt][b][0]
2787 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2788 except TypeError: # we have no bitrate info
2789 ext = formats[fmt][0]
2790 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2793 def _real_extract(self, url):
2794 mobj = re.match(self._VALID_URL, url)
2796 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# NOTE(review): .decode('utf-8') on regex groups assumes Python 2 byte
# strings; under Python 3 these calls would fail on str.
2798 # extract uploader & filename from url
2799 uploader = mobj.group(1).decode('utf-8')
2800 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2802 # construct API request
2803 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2804 # retrieve .json file with links to files
2805 request = compat_urllib_request.Request(file_url)
2807 self.report_download_json(file_url)
2808 jsonData = compat_urllib_request.urlopen(request).read()
2809 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2810 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2814 json_data = json.loads(jsonData)
2815 player_url = json_data['player_swf_url']
2816 formats = dict(json_data['audio_formats'])
2818 req_format = self._downloader.params.get('format', None)
2821 if self._downloader.params.get('listformats', None):
2822 self._print_formats(formats)
# Format selection: 'best' (or unset) probes every format for a live URL;
# otherwise the explicitly requested format must exist.
2825 if req_format is None or req_format == 'best':
2826 for format_param in formats.keys():
2827 url_list = self.get_urls(formats, format_param)
2829 file_url = self.check_urls(url_list)
2830 if file_url is not None:
2833 if req_format not in formats:
2834 self._downloader.trouble(u'ERROR: format is not available')
2837 url_list = self.get_urls(formats, req_format)
2838 file_url = self.check_urls(url_list)
2839 format_param = req_format
2842 'id': file_id.decode('utf-8'),
2843 'url': file_url.decode('utf-8'),
2844 'uploader': uploader.decode('utf-8'),
2845 'upload_date': None,
2846 'title': json_data['name'],
2847 'ext': file_url.split('.')[-1].decode('utf-8'),
2848 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2849 'thumbnail': json_data['thumbnail_url'],
2850 'description': json_data['description'],
2851 'player_url': player_url.decode('utf-8'),
2854 class StanfordOpenClassroomIE(InfoExtractor):
2855 """Information extractor for Stanford's Open ClassRoom"""
# Handles three URL shapes via named groups: a specific video
# (course+video), a course page (course only), and the root page (neither).
# Course/root pages return reference entries that are re-dispatched
# through self.extract(). NOTE(review): sampled listing — try:/return
# lines and parts of the info dicts are missing from this view.
2857 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2858 IE_NAME = u'stanfordoc'
2860 def report_download_webpage(self, objid):
2861 """Report information extraction."""
2862 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2864 def report_extraction(self, video_id):
2865 """Report information extraction."""
2866 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2868 def _real_extract(self, url):
2869 mobj = re.match(self._VALID_URL, url)
2871 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2874 if mobj.group('course') and mobj.group('video'): # A specific video
2875 course = mobj.group('course')
2876 video = mobj.group('video')
2878 'id': course + '_' + video,
2880 'upload_date': None,
2883 self.report_extraction(info['id'])
2884 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2885 xmlUrl = baseUrl + video + '.xml'
2887 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2888 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2889 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2891 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2893 info['title'] = mdoc.findall('./title')[0].text
2894 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2896 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2898 info['ext'] = info['url'].rpartition('.')[2]
2900 elif mobj.group('course'): # A course page
2901 course = mobj.group('course')
2906 'upload_date': None,
2909 self.report_download_webpage(info['id'])
2911 coursepage = compat_urllib_request.urlopen(url).read()
2912 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2913 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
2916 m = re.search('<h1>([^<]+)</h1>', coursepage)
2918 info['title'] = unescapeHTML(m.group(1))
2920 info['title'] = info['id']
2922 m = re.search('<description>([^<]+)</description>', coursepage)
2924 info['description'] = unescapeHTML(m.group(1))
# Each VideoPage link on the course page becomes a 'reference' entry that
# is recursively resolved via self.extract() below.
2926 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2929 'type': 'reference',
2930 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2934 for entry in info['list']:
2935 assert entry['type'] == 'reference'
2936 results += self.extract(entry['url'])
2941 'id': 'Stanford OpenClassroom',
2944 'upload_date': None,
2947 self.report_download_webpage(info['id'])
2948 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2950 rootpage = compat_urllib_request.urlopen(rootURL).read()
2951 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2952 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
2955 info['title'] = info['id']
2957 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2960 'type': 'reference',
2961 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2966 for entry in info['list']:
2967 assert entry['type'] == 'reference'
2968 results += self.extract(entry['url'])
# Extractor for mtv.com video pages: scrapes the mtv_vt (song name),
# mtv_an (performer) and mtvn_uri <meta> tags plus the default playlist
# id, fetches the mediaGen rendition XML, and picks the last (highest
# quality) <rendition>.
# NOTE(review): this listing has lines elided ("if mobj is None:"
# guards, "try:" headers, part of the result dict) — confirm against
# the full file before editing.
2971 class MTVIE(InfoExtractor):
2972 """Information extractor for MTV.com"""
2974 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2977 def report_extraction(self, video_id):
2978 """Report information extraction."""
2979 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2981 def _real_extract(self, url):
2982 mobj = re.match(self._VALID_URL, url)
2984 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; normalize to http:// here.
2986 if not mobj.group('proto'):
2987 url = 'http://' + url
2988 video_id = mobj.group('videoid')
2990 webpage = self._download_webpage(url, video_id)
2992 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2994 self._downloader.trouble(u'ERROR: unable to extract song name')
# NOTE(review): .decode('iso-8859-1') on a str raises AttributeError on
# Python 3 (webpage from _download_webpage is already text) — verify
# which Python this file targets.
2996 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2997 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2999 self._downloader.trouble(u'ERROR: unable to extract performer')
3001 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3002 video_title = performer + ' - ' + song_name
3004 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3006 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3008 mtvn_uri = mobj.group(1)
3010 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3012 self._downloader.trouble(u'ERROR: unable to extract content id')
3014 content_id = mobj.group(1)
# mediaGen endpoint returns the rendition list for this uri/id pair.
3016 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3017 self.report_extraction(video_id)
3018 request = compat_urllib_request.Request(videogen_url)
3020 metadataXml = compat_urllib_request.urlopen(request).read()
3021 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3022 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3025 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3026 renditions = mdoc.findall('.//rendition')
3028 # For now, always pick the highest quality.
3029 rendition = renditions[-1]
# Format label "<ext>-<width>x<height>_<bitrate>" built from rendition
# attributes; the <src> child holds the actual media URL.
3032 _,_,ext = rendition.attrib['type'].partition('/')
3033 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3034 video_url = rendition.find('./src').text
3036 self._downloader.trouble('Invalid rendition field.')
3042 'uploader': performer,
3043 'upload_date': None,
3044 'title': video_title,
# Extractor for v.youku.com.  Youku obfuscates its file ids: a seeded
# linear-congruential shuffle of a fixed alphabet (_get_file_ID_mix_string)
# maps the '*'-separated digit list in the playlist JSON back to the real
# file id (_get_file_id).  Videos are served in numbered segments whose
# index is spliced into columns 8-9 of the file id.
# NOTE(review): this listing has lines elided (the _gen_sid def line,
# guards, "try:" headers, parts of the format-selection ladder and the
# per-segment info dict) — confirm against the full file before editing.
3052 class YoukuIE(InfoExtractor):
3053 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3055 def report_download_webpage(self, file_id):
3056 """Report webpage download."""
3057 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3059 def report_extraction(self, file_id):
3060 """Report information extraction."""
3061 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: millisecond timestamp + two random components (belongs to
# _gen_sid, whose def line is elided from this listing).
3064 nowTime = int(time.time() * 1000)
3065 random1 = random.randint(1000,1998)
3066 random2 = random.randint(1000,9999)
3068 return "%d%d%d" %(nowTime,random1,random2)
3070 def _get_file_ID_mix_string(self, seed):
3072 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
# Seeded LCG (mod 65536) repeatedly picks-and-removes a character from
# the alphabet, producing a deterministic permutation for this seed.
3074 for i in range(len(source)):
3075 seed = (seed * 211 + 30031 ) % 65536
3076 index = math.floor(seed / 65536 * len(source) )
3077 mixed.append(source[int(index)])
3078 source.remove(source[int(index)])
3079 #return ''.join(mixed)
# Map each numeric index from the '*'-separated fileId onto the shuffled
# alphabet to recover the real file id.
3082 def _get_file_id(self, fileId, seed):
3083 mixed = self._get_file_ID_mix_string(seed)
3084 ids = fileId.split('*')
3088 realId.append(mixed[int(ch)])
3089 return ''.join(realId)
3091 def _real_extract(self, url):
3092 mobj = re.match(self._VALID_URL, url)
3094 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3096 video_id = mobj.group('ID')
# getPlayList returns JSON with title, seed, streamfileids and segs.
3098 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3100 request = compat_urllib_request.Request(info_url, None, std_headers)
3102 self.report_download_webpage(video_id)
3103 jsondata = compat_urllib_request.urlopen(request).read()
3104 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3105 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3108 self.report_extraction(video_id)
3110 jsonstr = jsondata.decode('utf-8')
3111 config = json.loads(jsonstr)
3113 video_title = config['data'][0]['title']
3114 seed = config['data'][0]['seed']
# Format selection: honour --format, defaulting to best available
# (the elided lines presumably map best/worst onto hd2/flv etc.).
3116 format = self._downloader.params.get('format', None)
3117 supported_format = list(config['data'][0]['streamfileids'].keys())
3119 if format is None or format == 'best':
3120 if 'hd2' in supported_format:
3125 elif format == 'worst':
3133 fileid = config['data'][0]['streamfileids'][format]
3134 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3135 except (UnicodeDecodeError, ValueError, KeyError):
3136 self._downloader.trouble(u'ERROR: unable to extract info section')
3140 sid = self._gen_sid()
3141 fileid = self._get_file_id(fileid, seed)
3143 #column 8,9 of fileid represent the segment number
3144 #fileid[7:9] should be changed
# One download URL per segment; index is hex-encoded into the fileid
# and the per-segment key 'k' authenticates the request.
3145 for index, key in enumerate(keys):
3147 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3148 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3151 'id': '%s_part%02d' % (video_id, index),
3152 'url': download_url,
3154 'upload_date': None,
3155 'title': video_title,
3158 files_info.append(info)
# Extractor for video.xnxx.com: the flv URL, title and thumbnail are all
# scraped from the watch page with the three class-level regexes below.
# NOTE(review): this listing has lines elided ("if mobj is None:" /
# "if result is None:" guards, "try:" header, parts of the info dict).
3163 class XNXXIE(InfoExtractor):
3164 """Information extractor for xnxx.com"""
3166 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# flv_url is URL-encoded in the page; unquoted after matching.
3168 VIDEO_URL_RE = r'flv_url=(.*?)&'
3169 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3170 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3172 def report_webpage(self, video_id):
3173 """Report information extraction"""
3174 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3176 def report_extraction(self, video_id):
3177 """Report information extraction"""
3178 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3180 def _real_extract(self, url):
3181 mobj = re.match(self._VALID_URL, url)
3183 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3185 video_id = mobj.group(1)
3187 self.report_webpage(video_id)
3189 # Get webpage content
3191 webpage_bytes = compat_urllib_request.urlopen(url).read()
3192 webpage = webpage_bytes.decode('utf-8')
3193 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3194 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3197 result = re.search(self.VIDEO_URL_RE, webpage)
3199 self._downloader.trouble(u'ERROR: unable to extract video url')
3201 video_url = compat_urllib_parse.unquote(result.group(1))
3203 result = re.search(self.VIDEO_TITLE_RE, webpage)
3205 self._downloader.trouble(u'ERROR: unable to extract video title')
3207 video_title = result.group(1)
3209 result = re.search(self.VIDEO_THUMB_RE, webpage)
3211 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3213 video_thumbnail = result.group(1)
3219 'upload_date': None,
3220 'title': video_title,
3222 'thumbnail': video_thumbnail,
3223 'description': None,
# Extractor for Google+ posts: scrapes the post page for date, uploader
# and title, follows the embedded photos URL to the video page, then
# collects all googlevideo redirector links and keeps the highest
# resolution one.
# NOTE(review): this listing has lines elided (guards, "try:" headers,
# parts of the result dict) — confirm against the full file.
3227 class GooglePlusIE(InfoExtractor):
3228 """Information extractor for plus.google.com."""
3230 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3231 IE_NAME = u'plus.google'
3233 def __init__(self, downloader=None):
3234 InfoExtractor.__init__(self, downloader)
3236 def report_extract_entry(self, url):
3237 """Report that the post entry page is being downloaded."""
3238 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3240 def report_date(self, upload_date):
3241 """Report the extracted upload date."""
3242 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3244 def report_uploader(self, uploader):
3245 """Report the extracted uploader name."""
3246 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3248 def report_title(self, video_title):
3249 """Report the extracted title."""
3250 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3252 def report_extract_vid_page(self, video_page):
3253 """Report information extraction."""
3254 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3256 def _real_extract(self, url):
3257 # Extract id from URL
3258 mobj = re.match(self._VALID_URL, url)
3260 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3263 post_url = mobj.group(0)
3264 video_id = mobj.group(1)
3266 video_extension = 'flv'
3268 # Step 1, Retrieve post webpage to extract further information
3269 self.report_extract_entry(post_url)
3270 request = compat_urllib_request.Request(post_url)
3272 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3273 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3274 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3277 # Extract update date
3279 pattern = 'title="Timestamp">(.*?)</a>'
3280 mobj = re.search(pattern, webpage)
3282 upload_date = mobj.group(1)
3283 # Convert timestring to a format suitable for filename
3284 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3285 upload_date = upload_date.strftime('%Y%m%d')
3286 self.report_date(upload_date)
# Uploader name from the rel="author" anchor.
3290 pattern = r'rel\="author".*?>(.*?)</a>'
3291 mobj = re.search(pattern, webpage)
3293 uploader = mobj.group(1)
3294 self.report_uploader(uploader)
3297 # Get the first line for title
3299 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3300 mobj = re.search(pattern, webpage)
3302 video_title = mobj.group(1)
3303 self.report_title(video_title)
3305 # Step 2, Stimulate clicking the image box to launch video
3306 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3307 mobj = re.search(pattern, webpage)
3309 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3311 video_page = mobj.group(1)
3312 request = compat_urllib_request.Request(video_page)
3314 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3315 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3316 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3318 self.report_extract_vid_page(video_page)
3321 # Extract video links on video page
3322 """Extract video links of all sizes"""
3323 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3324 mobj = re.findall(pattern, webpage)
3326 self._downloader.trouble(u'ERROR: unable to extract video links')
3328 # Sort in resolution
3329 links = sorted(mobj)
3331 # Choose the lowest of the sort, i.e. highest resolution
3332 video_url = links[-1]
3333 # Only get the url. The resolution part in the tuple has no use anymore
3334 video_url = video_url[-1]
3335 # Treat escaped \u0026 style hex
# Py2 str has .decode; on Python 3 this raises AttributeError and the
# except branch re-decodes via bytes instead.
3337 video_url = video_url.decode("unicode_escape")
3338 except AttributeError: # Python 3
3339 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3345 'uploader': uploader,
3346 'upload_date': upload_date,
3347 'title': video_title,
3348 'ext': video_extension,
# Extractor for nba.com video pages: the media URL is derived directly
# from the path component of the page URL; metadata is scraped via the
# local _findProp helper (regex over the already-downloaded webpage).
# NOTE(review): lines elided from this listing (guard, _findProp's
# if/else, parts of the info dict).
3351 class NBAIE(InfoExtractor):
3352 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3355 def _real_extract(self, url):
3356 mobj = re.match(self._VALID_URL, url)
3358 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3361 video_id = mobj.group(1)
3362 if video_id.endswith('/index.html'):
3363 video_id = video_id[:-len('/index.html')]
3365 webpage = self._download_webpage(url, video_id)
# The CDN URL is constructed, not scraped — only metadata needs the page.
3367 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure over `webpage`: first regex group, HTML-unescaped, else default.
3368 def _findProp(rexp, default=None):
3369 m = re.search(rexp, webpage)
3371 return unescapeHTML(m.group(1))
3375 shortened_video_id = video_id.rpartition('/')[2]
3376 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3378 'id': shortened_video_id,
# NOTE(review): 'uploader_date' is not one of the documented info-dict
# keys — almost certainly a typo for 'upload_date'; fix when editing.
3382 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3383 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv: queries the Justin.tv REST API
# (channel archives or a single broadcast) and pages through results
# _JUSTIN_PAGE_LIMIT at a time.
# NOTE(review): lines elided from this listing ("try:" headers, guards,
# the paging loop header and the final return).
3387 class JustinTVIE(InfoExtractor):
3388 """Information extractor for justin.tv and twitch.tv"""
3389 # TODO: One broadcast may be split into multiple videos. The key
3390 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3391 # starts at 1 and increases. Can we treat all parts as one video?
3393 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3394 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3395 _JUSTIN_PAGE_LIMIT = 100
3396 IE_NAME = u'justin.tv'
3398 def report_extraction(self, file_id):
3399 """Report information extraction."""
3400 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3402 def report_download_page(self, channel, offset):
3403 """Report attempt to download a single page of videos."""
3404 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3405 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3407 # Return count of items, list of *valid* items
3408 def _parse_page(self, url):
3410 urlh = compat_urllib_request.urlopen(url)
3411 webpage_bytes = urlh.read()
3412 webpage = webpage_bytes.decode('utf-8', 'ignore')
3413 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3414 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
# A non-list response is the API's error envelope.
3417 response = json.loads(webpage)
3418 if type(response) != list:
3419 error_text = response.get('error', 'unknown error')
3420 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3423 for clip in response:
3424 video_url = clip['video_file_url']
3426 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish 'YYYY-MM-DD...'; strip dashes -> YYYYMMDD.
3427 video_date = re.sub('-', '', clip['start_time'][:10])
3428 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3429 video_id = clip['id']
3430 video_title = clip.get('title', video_id)
3434 'title': video_title,
3435 'uploader': clip.get('channel_name', video_uploader_id),
3436 'uploader_id': video_uploader_id,
3437 'upload_date': video_date,
3438 'ext': video_extension,
# Count reflects the raw response so the caller can detect a short page.
3440 return (len(response), info)
3442 def _real_extract(self, url):
3443 mobj = re.match(self._VALID_URL, url)
3445 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 2 present => single broadcast URL (/b/<id>); else a channel.
3448 api = 'http://api.justin.tv'
3449 video_id = mobj.group(mobj.lastindex)
3451 if mobj.lastindex == 1:
3453 api += '/channel/archives/%s.json'
3455 api += '/broadcast/by_archive/%s.json'
3456 api = api % (video_id,)
3458 self.report_extraction(video_id)
3462 limit = self._JUSTIN_PAGE_LIMIT
# Page until the API returns fewer than `limit` items.
3465 self.report_download_page(video_id, offset)
3466 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3467 page_count, page_info = self._parse_page(page_url)
3468 info.extend(page_info)
3469 if not paged or page_count != limit:
# Extractor for funnyordie.com: media URL from the second <source> tag,
# title from the player_page_h1 anchor, description from og:description.
# NOTE(review): guards ("if m is None:") and the final info dict are
# elided from this listing.
3474 class FunnyOrDieIE(InfoExtractor):
3475 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3477 def _real_extract(self, url):
3478 mobj = re.match(self._VALID_URL, url)
3480 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3483 video_id = mobj.group('id')
3484 webpage = self._download_webpage(url, video_id)
3486 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3488 self._downloader.trouble(u'ERROR: unable to find video information')
3489 video_url = unescapeHTML(m.group('url'))
3491 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3493 self._downloader.trouble(u'Cannot find video title')
3494 title = unescapeHTML(m.group('title'))
3496 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3498 desc = unescapeHTML(m.group('desc'))
3507 'description': desc,
# Extractor for tweetreel.com: the status id scraped from the page is
# enough to construct the .mov media URL directly; the remaining regexes
# pull description, uploader and upload date (unix timestamp -> YYYYMMDD).
# NOTE(review): guards and parts of the result dict are elided from this
# listing.
3511 class TweetReelIE(InfoExtractor):
3512 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3514 def _real_extract(self, url):
3515 mobj = re.match(self._VALID_URL, url)
3517 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3520 video_id = mobj.group('id')
3521 webpage = self._download_webpage(url, video_id)
3523 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3525 self._downloader.trouble(u'ERROR: Cannot find status ID')
3526 status_id = m.group(1)
3528 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3530 self._downloader.trouble(u'WARNING: Cannot find description')
# Strip embedded <a>...</a> tags before unescaping the tweet text.
3531 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3533 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3535 self._downloader.trouble(u'ERROR: Cannot find uploader')
3536 uploader = unescapeHTML(m.group('uploader'))
3537 uploader_id = unescapeHTML(m.group('uploader_id'))
3539 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3541 self._downloader.trouble(u'ERROR: Cannot find upload date')
3542 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
# Media URL is fully determined by the status id.
3545 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3552 'description': desc,
3553 'uploader': uploader,
3554 'uploader_id': uploader_id,
3555 'internal_id': status_id,
3556 'upload_date': upload_date
# Extractor for store.steampowered.com: one page can host several game
# trailers, so movie URL matches are zipped with <span class="title">
# matches to pair each video with its name.  _VALID_URL is written with
# (?x)-style whitespace, hence the re.VERBOSE in suitable()/extract.
# NOTE(review): lines elided (the gameID group of _VALID_URL, the info
# dict body and the videos-list return).
3560 class SteamIE(InfoExtractor):
3561 _VALID_URL = r"""http://store.steampowered.com/
3562 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3564 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# Overrides the base suitable() because _VALID_URL needs re.VERBOSE.
3567 def suitable(self, url):
3568 """Receives a URL and returns True if suitable for this IE."""
3569 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3571 def _real_extract(self, url):
3572 m = re.match(self._VALID_URL, url, re.VERBOSE)
3573 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3574 gameID = m.group('gameID')
3575 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3576 webpage = self._download_webpage(videourl, gameID)
3577 mweb = re.finditer(urlRE, webpage)
3578 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3579 titles = re.finditer(namesRE, webpage)
# Pair each movie match with its title match positionally.
3581 for vid,vtitle in zip(mweb,titles):
3582 video_id = vid.group('videoID')
3583 title = vtitle.group('videoName')
3584 video_url = vid.group('videoURL')
3586 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3591 'title': unescapeHTML(title)
# Extractor for ustream.tv recorded videos: the media URL is built from
# the numeric id; title and uploader are scraped from the page.
# NOTE(review): the info dict head and return are elided from this
# listing; the re.search results are used unguarded (m.group on a
# possible None) — verify against the full file.
3596 class UstreamIE(InfoExtractor):
3597 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3598 IE_NAME = u'ustream'
3600 def _real_extract(self, url):
3601 m = re.match(self._VALID_URL, url)
3602 video_id = m.group('videoID')
# CDN URL is derived from the id alone.
3603 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3604 webpage = self._download_webpage(url, video_id)
3605 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3606 title = m.group('title')
3607 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3608 uploader = m.group('uploader')
3614 'uploader': uploader
# Extractor for rbmaradio.com shows: all metadata comes from the
# `gon.show` JSON blob embedded in a <script> tag; the audio URL is the
# akamai_url with a fixed 256 kbps cbr parameter appended.
# NOTE(review): "try:" header and the info-dict head are elided from
# this listing.
3618 class RBMARadioIE(InfoExtractor):
3619 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3621 def _real_extract(self, url):
3622 m = re.match(self._VALID_URL, url)
3623 video_id = m.group('videoID')
3625 webpage = self._download_webpage(url, video_id)
3626 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3628 raise ExtractorError(u'Cannot find metadata')
3629 json_data = m.group(1)
3632 data = json.loads(json_data)
3633 except ValueError as e:
3634 raise ExtractorError(u'Invalid JSON: ' + str(e))
3636 video_url = data['akamai_url'] + '&cbr=256'
# Extension derived from the URL path suffix.
3637 url_parts = compat_urllib_parse_urlparse(video_url)
3638 video_ext = url_parts.path.rpartition('.')[2]
# Optional metadata uses .get() chains so missing keys yield None.
3643 'title': data['title'],
3644 'description': data.get('teaser_text'),
3645 'location': data.get('country_of_origin'),
3646 'uploader': data.get('host', {}).get('name'),
3647 'uploader_id': data.get('host', {}).get('slug'),
3648 'thumbnail': data.get('image', {}).get('large_url_2x'),
3649 'duration': data.get('duration'),
# Extractor for youporn.com: scrapes title/date/uploader, then parses
# the downloadList <ul> for every available format link.  The format
# label "<size>p-<bitrate>k" is reconstructed from the link path, and
# the requested --format (best/worst/all/specific) selects from the
# accumulated formats list.
# NOTE(review): lines elided throughout this listing (guards, loop
# headers such as "for link in links:", parts of the per-format dict,
# the best/all return statements).
3654 class YouPornIE(InfoExtractor):
3655 """Information extractor for youporn.com."""
3656 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3658 def _print_formats(self, formats):
3659 """Print all available formats"""
3660 print(u'Available formats:')
3661 print(u'ext\t\tformat')
3662 print(u'---------------------------------')
3663 for format in formats:
3664 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the single format dict whose 'format' equals req_format
# (loop/return lines elided from this listing).
3666 def _specific(self, req_format, formats):
3668 if(x["format"]==req_format):
3672 def _real_extract(self, url):
3673 mobj = re.match(self._VALID_URL, url)
3675 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3678 video_id = mobj.group('videoid')
# Age-gate cookie lets us fetch the page without the interstitial.
3680 req = compat_urllib_request.Request(url)
3681 req.add_header('Cookie', 'age_verified=1')
3682 webpage = self._download_webpage(req, video_id)
3684 # Get the video title
3685 result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
3687 raise ExtractorError(u'ERROR: unable to extract video title')
3688 video_title = result.group('title').strip()
3690 # Get the video date
3691 result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
3693 self._downloader.to_stderr(u'WARNING: unable to extract video date')
3696 upload_date = result.group('date').strip()
3698 # Get the video uploader
3699 result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
3701 self._downloader.to_stderr(u'ERROR: unable to extract uploader')
3702 video_uploader = None
3704 video_uploader = result.group('uploader').strip()
3705 video_uploader = clean_html( video_uploader )
3707 # Get all of the formats available
3708 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3709 result = re.search(DOWNLOAD_LIST_RE, webpage)
3711 raise ExtractorError(u'Unable to extract download list')
3712 download_list_html = result.group('download_list').strip()
3714 # Get all of the links from the page
3715 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3716 links = re.findall(LINK_RE, download_list_html)
3717 if(len(links) == 0):
3718 raise ExtractorError(u'ERROR: no known formats available for video')
3720 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3725 # A link looks like this:
3726 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3727 # A path looks like this:
3728 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3729 video_url = unescapeHTML( link )
3730 path = compat_urllib_parse_urlparse( video_url ).path
3731 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes "<size>p_<bitrate>k_<id>"; first two pieces
# become the format label.
3732 format = path.split('/')[4].split('_')[:2]
3735 format = "-".join( format )
3736 title = u'%s-%s-%s' % (video_title, size, bitrate)
3741 'uploader': video_uploader,
3742 'upload_date': upload_date,
3747 'description': None,
3751 if self._downloader.params.get('listformats', None):
3752 self._print_formats(formats)
3755 req_format = self._downloader.params.get('format', None)
3756 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# Formats are ordered best-first, so worst == last element.
3758 if req_format is None or req_format == 'best':
3760 elif req_format == 'worst':
3761 return [formats[-1]]
3762 elif req_format in ('-1', 'all'):
3765 format = self._specific( req_format, formats )
3767 self._downloader.trouble(u'ERROR: requested format not available')
# Extractor for pornotube.com: the flv URL and upload date are scraped
# from the watch page; the title comes from the URL itself (named group
# in _VALID_URL).
# NOTE(review): "if result is None:" guards and parts of the info dict
# are elided from this listing; note the upload-date failure message
# says "title" — likely a copy/paste slip to fix when editing.
3773 class PornotubeIE(InfoExtractor):
3774 """Information extractor for pornotube.com."""
3775 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3777 def _real_extract(self, url):
3778 mobj = re.match(self._VALID_URL, url)
3780 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3783 video_id = mobj.group('videoid')
3784 video_title = mobj.group('title')
3786 # Get webpage content
3787 webpage = self._download_webpage(url, video_id)
3790 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3791 result = re.search(VIDEO_URL_RE, webpage)
3793 self._downloader.trouble(u'ERROR: unable to extract video url')
3795 video_url = compat_urllib_parse.unquote(result.group('url'))
3797 #Get the uploaded date
3798 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3799 result = re.search(VIDEO_UPLOADED_RE, webpage)
3801 self._downloader.trouble(u'ERROR: unable to extract video title')
3803 upload_date = result.group('date')
3805 info = {'id': video_id,
3808 'upload_date': upload_date,
3809 'title': video_title,
# Extractor for youjizz.com: title from the watch page, then a hop to
# the embed page whose flash variables contain the real media URL.
# NOTE(review): "if result is None:" guards and parts of the info dict
# are elided from this listing.
3817 class YouJizzIE(InfoExtractor):
3818 """Information extractor for youjizz.com."""
3819 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3821 def _real_extract(self, url):
3822 mobj = re.match(self._VALID_URL, url)
3824 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3827 video_id = mobj.group('videoid')
3829 # Get webpage content
3830 webpage = self._download_webpage(url, video_id)
3832 # Get the video title
3833 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3835 raise ExtractorError(u'ERROR: unable to extract video title')
3836 video_title = result.group('title').strip()
3838 # Get the embed page
3839 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3841 raise ExtractorError(u'ERROR: unable to extract embed page')
# The numeric embed id replaces the slug id from the watch URL.
3843 embed_page_url = result.group(0).strip()
3844 video_id = result.group('videoid')
3846 webpage = self._download_webpage(embed_page_url, video_id)
# Media URL is passed to the flash player via addVariable("file", ...).
3849 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3851 raise ExtractorError(u'ERROR: unable to extract video url')
3852 video_url = result.group('source')
3854 info = {'id': video_id,
3856 'title': video_title,
# player_url enables rtmpdump-style retrieval via the embed SWF.
3859 'player_url': embed_page_url}
3864 def gen_extractors():
3865 """ Return a list of an instance of every supported extractor.
3866 The order does matter; the first extractor matched is the one handling the URL.
3869 YoutubePlaylistIE(),
3893 StanfordOpenClassroomIE(),