youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import netrc
   9 import os
  10 import re
  11 import socket
  12 import time
  13 import email.utils
  14 import xml.etree.ElementTree
  15 import random
  16 import math
  17
  18 from .utils import *
  19
  20
  21 class InfoExtractor(object):
  22     """Information Extractor class.
  23
  24     Information extractors are the classes that, given a URL, extract
  25     information about the video (or videos) the URL refers to. This
  26     information includes the real video URL, the video title, author and
  27     others. The information is stored in a dictionary which is then
  28     passed to the FileDownloader. The FileDownloader processes this
  29     information possibly downloading the video to the file system, among
  30     other possible outcomes.
  31
  32     The dictionaries must include the following fields:
  33
  34     id:             Video identifier.
  35     url:            Final video URL.
  36     title:          Video title, unescaped.
  37     ext:            Video filename extension.
  38     uploader:       Full name of the video uploader.
  39     upload_date:    Video upload date (YYYYMMDD).
  40
  41     The following fields are optional:
  42
  43     format:         The video format, defaults to ext (used for --get-format)
  44     thumbnail:      Full URL to a video thumbnail image.
  45     description:    One-line video description.
  46     uploader_id:    Nickname or id of the video uploader.
  47     player_url:     SWF Player URL (used for rtmpdump).
  48     subtitles:      The .srt file contents.
  49     urlhandle:      [internal] The urlHandle to be used to download the file,
  50                     like returned by urllib.request.urlopen
  51
  52     The fields should all be Unicode strings.
  53
  54     Subclasses of this one should re-define the _real_initialize() and
  55     _real_extract() methods and define a _VALID_URL regexp.
  56     Probably, they should also be added to the list of extractors.
  57
  58     _real_extract() must return a *list* of information dictionaries as
  59     described above.
  60
  61     Finally, the _WORKING attribute should be set to False for broken IEs
  62     in order to warn the users and skip the tests.
  63     """
  64
  65     _ready = False
  66     _downloader = None
  67     _WORKING = True
  68
  69     def __init__(self, downloader=None):
  70         """Constructor. Receives an optional downloader."""
  71         self._ready = False
  72         self.set_downloader(downloader)
  73
  74     def suitable(self, url):
  75         """Receives a URL and returns True if suitable for this IE."""
  76         return re.match(self._VALID_URL, url) is not None
  77
  78     def working(self):
  79         """Getter method for _WORKING."""
  80         return self._WORKING
  81
  82     def initialize(self):
  83         """Initializes an instance (authentication, etc)."""
  84         if not self._ready:
  85             self._real_initialize()
  86             self._ready = True
  87
  88     def extract(self, url):
  89         """Extracts URL information and returns it in list of dicts."""
  90         self.initialize()
  91         return self._real_extract(url)
  92
  93     def set_downloader(self, downloader):
  94         """Sets the downloader for this IE."""
  95         self._downloader = downloader
  96
  97     def _real_initialize(self):
  98         """Real initialization process. Redefine in subclasses."""
  99         pass
 100
 101     def _real_extract(self, url):
 102         """Real extraction process. Redefine in subclasses."""
 103         pass
 104
 105     @property
 106     def IE_NAME(self):
 107         return type(self).__name__[:-2]
 108
 109     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 110         """ Returns the response handle """
 111         if note is None:
 112             note = u'Downloading video webpage'
 113         self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
 114         try:
 115             return compat_urllib_request.urlopen(url_or_request)
 116         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 117             if errnote is None:
 118                 errnote = u'Unable to download webpage'
 119             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 120
 121     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 122         """ Returns the data of the page as a string """
 123         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 124         webpage_bytes = urlh.read()
 125         return webpage_bytes.decode('utf-8', 'replace')
 126
 127
 128 class YoutubeIE(InfoExtractor):
 129     """Information extractor for youtube.com."""
 130
 131     _VALID_URL = r"""^
 132                      (
 133                          (?:https?://)?                                       # http(s):// (optional)
 134                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 135                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 136                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 137                          (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 138                          (?:                                                  # the various things that can precede the ID:
 139                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 140                              |(?:                                             # or the v= param in all its forms
 141                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 142                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 143                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 144                                  v=
 145                              )
 146                          )?                                                   # optional -> youtube.com/xxxx is OK
 147                      )?                                                       # all until now is optional -> you can pass the naked ID
 148                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 149                      (?(1).+)?                                                # if we found the ID, everything can follow
 150                      $"""
 151     _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 152     _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 153     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 154     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 155     _NETRC_MACHINE = 'youtube'
 156     # Listed in order of quality
 157     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 158     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 159     _video_extensions = {
 160         '13': '3gp',
 161         '17': 'mp4',
 162         '18': 'mp4',
 163         '22': 'mp4',
 164         '37': 'mp4',
 165         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 166         '43': 'webm',
 167         '44': 'webm',
 168         '45': 'webm',
 169         '46': 'webm',
 170     }
 171     _video_dimensions = {
 172         '5': '240x400',
 173         '6': '???',
 174         '13': '???',
 175         '17': '144x176',
 176         '18': '360x640',
 177         '22': '720x1280',
 178         '34': '360x640',
 179         '35': '480x854',
 180         '37': '1080x1920',
 181         '38': '3072x4096',
 182         '43': '360x640',
 183         '44': '480x854',
 184         '45': '720x1280',
 185         '46': '1080x1920',
 186     }
 187     IE_NAME = u'youtube'
 188
 189     def suitable(self, url):
 190         """Receives a URL and returns True if suitable for this IE."""
 191         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 192
 193     def report_lang(self):
 194         """Report attempt to set language."""
 195         self._downloader.to_screen(u'[youtube] Setting language')
 196
 197     def report_login(self):
 198         """Report attempt to log in."""
 199         self._downloader.to_screen(u'[youtube] Logging in')
 200
 201     def report_age_confirmation(self):
 202         """Report attempt to confirm age."""
 203         self._downloader.to_screen(u'[youtube] Confirming age')
 204
 205     def report_video_webpage_download(self, video_id):
 206         """Report attempt to download video webpage."""
 207         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 208
 209     def report_video_info_webpage_download(self, video_id):
 210         """Report attempt to download video info webpage."""
 211         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 212
 213     def report_video_subtitles_download(self, video_id):
 214         """Report attempt to download video info webpage."""
 215         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 216
 217     def report_information_extraction(self, video_id):
 218         """Report attempt to extract video information."""
 219         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 220
 221     def report_unavailable_format(self, video_id, format):
 222         """Report extracted video URL."""
 223         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 224
 225     def report_rtmp_download(self):
 226         """Indicate the download will use the RTMP protocol."""
 227         self._downloader.to_screen(u'[youtube] RTMP download detected')
 228
 229     def _closed_captions_xml_to_srt(self, xml_string):
 230         srt = ''
 231         texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 232         # TODO parse xml instead of regex
 233         for n, (start, dur_tag, dur, caption) in enumerate(texts):
 234             if not dur: dur = '4'
 235             start = float(start)
 236             end = start + float(dur)
 237             start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 238             end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 239             caption = unescapeHTML(caption)
 240             caption = unescapeHTML(caption) # double cycle, intentional
 241             srt += str(n+1) + '\n'
 242             srt += start + ' --> ' + end + '\n'
 243             srt += caption + '\n\n'
 244         return srt
 245
 246     def _extract_subtitles(self, video_id):
 247         self.report_video_subtitles_download(video_id)
 248         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 249         try:
 250             srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 251         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 252             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 253         srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 254         srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 255         if not srt_lang_list:
 256             return (u'WARNING: video has no closed captions', None)
 257         if self._downloader.params.get('subtitleslang', False):
 258             srt_lang = self._downloader.params.get('subtitleslang')
 259         elif 'en' in srt_lang_list:
 260             srt_lang = 'en'
 261         else:
 262             srt_lang = list(srt_lang_list.keys())[0]
 263         if not srt_lang in srt_lang_list:
 264             return (u'WARNING: no closed captions found in the specified language', None)
 265         request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
 266         try:
 267             srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
 268         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 269             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 270         if not srt_xml:
 271             return (u'WARNING: unable to download video subtitles', None)
 272         return (None, self._closed_captions_xml_to_srt(srt_xml))
 273
 274     def _print_formats(self, formats):
 275         print('Available formats:')
 276         for x in formats:
 277             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 278
 279     def _real_initialize(self):
 280         if self._downloader is None:
 281             return
 282
 283         username = None
 284         password = None
 285         downloader_params = self._downloader.params
 286
 287         # Attempt to use provided username and password or .netrc data
 288         if downloader_params.get('username', None) is not None:
 289             username = downloader_params['username']
 290             password = downloader_params['password']
 291         elif downloader_params.get('usenetrc', False):
 292             try:
 293                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 294                 if info is not None:
 295                     username = info[0]
 296                     password = info[2]
 297                 else:
 298                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 299             except (IOError, netrc.NetrcParseError) as err:
 300                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
 301                 return
 302
 303         # Set language
 304         request = compat_urllib_request.Request(self._LANG_URL)
 305         try:
 306             self.report_lang()
 307             compat_urllib_request.urlopen(request).read()
 308         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 309             self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
 310             return
 311
 312         # No authentication to be performed
 313         if username is None:
 314             return
 315
 316         # Log in
 317         login_form = {
 318                 'current_form': 'loginForm',
 319                 'next':     '/',
 320                 'action_login': 'Log In',
 321                 'username': username,
 322                 'password': password,
 323                 }
 324         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
 325         try:
 326             self.report_login()
 327             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 328             if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 329                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 330                 return
 331         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 332             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
 333             return
 334
 335         # Confirm age
 336         age_form = {
 337                 'next_url':     '/',
 338                 'action_confirm':   'Confirm',
 339                 }
 340         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 341         try:
 342             self.report_age_confirmation()
 343             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 344         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 345             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 346             return
 347
 348     def _extract_id(self, url):
 349         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 350         if mobj is None:
 351             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 352             return
 353         video_id = mobj.group(2)
 354         return video_id
 355
 356     def _real_extract(self, url):
 357         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 358         mobj = re.search(self._NEXT_URL_RE, url)
 359         if mobj:
 360             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 361         video_id = self._extract_id(url)
 362
 363         # Get video webpage
 364         self.report_video_webpage_download(video_id)
 365         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 366         request = compat_urllib_request.Request(url)
 367         try:
 368             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 369         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 370             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
 371             return
 372
 373         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 374
 375         # Attempt to extract SWF player URL
 376         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 377         if mobj is not None:
 378             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 379         else:
 380             player_url = None
 381
 382         # Get video info
 383         self.report_video_info_webpage_download(video_id)
 384         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 385             video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 386                     % (video_id, el_type))
 387             request = compat_urllib_request.Request(video_info_url)
 388             try:
 389                 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
 390                 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
 391                 video_info = compat_parse_qs(video_info_webpage)
 392                 if 'token' in video_info:
 393                     break
 394             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 395                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
 396                 return
 397         if 'token' not in video_info:
 398             if 'reason' in video_info:
 399                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
 400             else:
 401                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 402             return
 403
 404         # Check for "rental" videos
 405         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 406             self._downloader.trouble(u'ERROR: "rental" videos not supported')
 407             return
 408
 409         # Start extracting information
 410         self.report_information_extraction(video_id)
 411
 412         # uploader
 413         if 'author' not in video_info:
 414             self._downloader.trouble(u'ERROR: unable to extract uploader name')
 415             return
 416         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 417
 418         # uploader_id
 419         video_uploader_id = None
 420         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 421         if mobj is not None:
 422             video_uploader_id = mobj.group(1)
 423         else:
 424             self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 425
 426         # title
 427         if 'title' not in video_info:
 428             self._downloader.trouble(u'ERROR: unable to extract video title')
 429             return
 430         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 431
 432         # thumbnail image
 433         if 'thumbnail_url' not in video_info:
 434             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 435             video_thumbnail = ''
 436         else:   # don't panic if we can't find it
 437             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 438
 439         # upload date
 440         upload_date = None
 441         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 442         if mobj is not None:
 443             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 444             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 445             for expression in format_expressions:
 446                 try:
 447                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 448                 except:
 449                     pass
 450
 451         # description
 452         video_description = get_element_by_id("eow-description", video_webpage)
 453         if video_description:
 454             video_description = clean_html(video_description)
 455         else:
 456             video_description = ''
 457
 458         # closed captions
 459         video_subtitles = None
 460         if self._downloader.params.get('writesubtitles', False):
 461             (srt_error, video_subtitles) = self._extract_subtitles(video_id)
 462             if srt_error:
 463                 self._downloader.trouble(srt_error)
 464
 465         if 'length_seconds' not in video_info:
 466             self._downloader.trouble(u'WARNING: unable to extract video duration')
 467             video_duration = ''
 468         else:
 469             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 470
 471         # token
 472         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 473
 474         # Decide which formats to download
 475         req_format = self._downloader.params.get('format', None)
 476
 477         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 478             self.report_rtmp_download()
 479             video_url_list = [(None, video_info['conn'][0])]
 480         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 481             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 482             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 483             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 484             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 485
 486             format_limit = self._downloader.params.get('format_limit', None)
 487             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 488             if format_limit is not None and format_limit in available_formats:
 489                 format_list = available_formats[available_formats.index(format_limit):]
 490             else:
 491                 format_list = available_formats
 492             existing_formats = [x for x in format_list if x in url_map]
 493             if len(existing_formats) == 0:
 494                 self._downloader.trouble(u'ERROR: no known formats available for video')
 495                 return
 496             if self._downloader.params.get('listformats', None):
 497                 self._print_formats(existing_formats)
 498                 return
 499             if req_format is None or req_format == 'best':
 500                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 501             elif req_format == 'worst':
 502                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 503             elif req_format in ('-1', 'all'):
 504                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 505             else:
 506                 # Specific formats. We pick the first in a slash-delimeted sequence.
 507                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 508                 req_formats = req_format.split('/')
 509                 video_url_list = None
 510                 for rf in req_formats:
 511                     if rf in url_map:
 512                         video_url_list = [(rf, url_map[rf])]
 513                         break
 514                 if video_url_list is None:
 515                     self._downloader.trouble(u'ERROR: requested format not available')
 516                     return
 517         else:
 518             self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 519             return
 520
 521         results = []
 522         for format_param, video_real_url in video_url_list:
 523             # Extension
 524             video_extension = self._video_extensions.get(format_param, 'flv')
 525
 526             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 527                                               self._video_dimensions.get(format_param, '???'))
 528
 529             results.append({
 530                 'id':       video_id,
 531                 'url':      video_real_url,
 532                 'uploader': video_uploader,
 533                 'uploader_id': video_uploader_id,
 534                 'upload_date':  upload_date,
 535                 'title':    video_title,
 536                 'ext':      video_extension,
 537                 'format':   video_format,
 538                 'thumbnail':    video_thumbnail,
 539                 'description':  video_description,
 540                 'player_url':   player_url,
 541                 'subtitles':    video_subtitles,
 542                 'duration':     video_duration
 543             })
 544         return results
 545
 546
 547 class MetacafeIE(InfoExtractor):
 548     """Information Extractor for metacafe.com."""
 549
 550     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 551     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 552     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 553     IE_NAME = u'metacafe'
 554
 555     def __init__(self, downloader=None):
 556         InfoExtractor.__init__(self, downloader)
 557
 558     def report_disclaimer(self):
 559         """Report disclaimer retrieval."""
 560         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 561
 562     def report_age_confirmation(self):
 563         """Report attempt to confirm age."""
 564         self._downloader.to_screen(u'[metacafe] Confirming age')
 565
 566     def report_download_webpage(self, video_id):
 567         """Report webpage download."""
 568         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 569
 570     def report_extraction(self, video_id):
 571         """Report information extraction."""
 572         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 573
 574     def _real_initialize(self):
 575         # Retrieve disclaimer
 576         request = compat_urllib_request.Request(self._DISCLAIMER)
 577         try:
 578             self.report_disclaimer()
 579             disclaimer = compat_urllib_request.urlopen(request).read()
 580         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 581             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
 582             return
 583
 584         # Confirm age
 585         disclaimer_form = {
 586             'filters': '0',
 587             'submit': "Continue - I'm over 18",
 588             }
 589         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 590         try:
 591             self.report_age_confirmation()
 592             disclaimer = compat_urllib_request.urlopen(request).read()
 593         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 594             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 595             return
 596
 597     def _real_extract(self, url):
 598         # Extract id and simplified title from URL
 599         mobj = re.match(self._VALID_URL, url)
 600         if mobj is None:
 601             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 602             return
 603
 604         video_id = mobj.group(1)
 605
 606         # Check if video comes from YouTube
 607         mobj2 = re.match(r'^yt-(.*)$', video_id)
 608         if mobj2 is not None:
 609             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 610             return
 611
 612         # Retrieve video webpage to extract further information
 613         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 614         try:
 615             self.report_download_webpage(video_id)
 616             webpage = compat_urllib_request.urlopen(request).read()
 617         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 618             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 619             return
 620
 621         # Extract URL, uploader and title from webpage
 622         self.report_extraction(video_id)
 623         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 624         if mobj is not None:
 625             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 626             video_extension = mediaURL[-3:]
 627
 628             # Extract gdaKey if available
 629             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 630             if mobj is None:
 631                 video_url = mediaURL
 632             else:
 633                 gdaKey = mobj.group(1)
 634                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 635         else:
 636             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 637             if mobj is None:
 638                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 639                 return
 640             vardict = compat_parse_qs(mobj.group(1))
 641             if 'mediaData' not in vardict:
 642                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 643                 return
 644             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 645             if mobj is None:
 646                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 647                 return
 648             mediaURL = mobj.group(1).replace('\\/', '/')
 649             video_extension = mediaURL[-3:]
 650             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 651
 652         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 653         if mobj is None:
 654             self._downloader.trouble(u'ERROR: unable to extract title')
 655             return
 656         video_title = mobj.group(1).decode('utf-8')
 657
 658         mobj = re.search(r'submitter=(.*?);', webpage)
 659         if mobj is None:
 660             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 661             return
 662         video_uploader = mobj.group(1)
 663
 664         return [{
 665             'id':       video_id.decode('utf-8'),
 666             'url':      video_url.decode('utf-8'),
 667             'uploader': video_uploader.decode('utf-8'),
 668             'upload_date':  None,
 669             'title':    video_title,
 670             'ext':      video_extension.decode('utf-8'),
 671         }]
 672
 673
 674 class DailymotionIE(InfoExtractor):
 675     """Information Extractor for Dailymotion"""
 676
 677     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 678     IE_NAME = u'dailymotion'
 679
 680     def __init__(self, downloader=None):
 681         InfoExtractor.__init__(self, downloader)
 682
 683     def report_extraction(self, video_id):
 684         """Report information extraction."""
 685         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 686
 687     def _real_extract(self, url):
 688         # Extract id and simplified title from URL
 689         mobj = re.match(self._VALID_URL, url)
 690         if mobj is None:
 691             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 692             return
 693
 694         video_id = mobj.group(1).split('_')[0].split('?')[0]
 695
 696         video_extension = 'mp4'
 697
 698         # Retrieve video webpage to extract further information
 699         request = compat_urllib_request.Request(url)
 700         request.add_header('Cookie', 'family_filter=off')
 701         webpage = self._download_webpage(request, video_id)
 702
 703         # Extract URL, uploader and title from webpage
 704         self.report_extraction(video_id)
 705         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 706         if mobj is None:
 707             self._downloader.trouble(u'ERROR: unable to extract media URL')
 708             return
 709         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 710
 711         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 712             if key in flashvars:
 713                 max_quality = key
 714                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 715                 break
 716         else:
 717             self._downloader.trouble(u'ERROR: unable to extract video URL')
 718             return
 719
 720         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 721         if mobj is None:
 722             self._downloader.trouble(u'ERROR: unable to extract video URL')
 723             return
 724
 725         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 726
 727         # TODO: support choosing qualities
 728
 729         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 730         if mobj is None:
 731             self._downloader.trouble(u'ERROR: unable to extract title')
 732             return
 733         video_title = unescapeHTML(mobj.group('title'))
 734
 735         video_uploader = None
 736         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 737         if mobj is None:
 738             # lookin for official user
 739             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 740             if mobj_official is None:
 741                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 742             else:
 743                 video_uploader = mobj_official.group(1)
 744         else:
 745             video_uploader = mobj.group(1)
 746
 747         video_upload_date = None
 748         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 749         if mobj is not None:
 750             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 751
 752         return [{
 753             'id':       video_id,
 754             'url':      video_url,
 755             'uploader': video_uploader,
 756             'upload_date':  video_upload_date,
 757             'title':    video_title,
 758             'ext':      video_extension,
 759         }]
 760
 761
 762 class PhotobucketIE(InfoExtractor):
 763     """Information extractor for photobucket.com."""
 764
 765     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 766     IE_NAME = u'photobucket'
 767
 768     def __init__(self, downloader=None):
 769         InfoExtractor.__init__(self, downloader)
 770
 771     def report_download_webpage(self, video_id):
 772         """Report webpage download."""
 773         self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 774
 775     def report_extraction(self, video_id):
 776         """Report information extraction."""
 777         self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 778
 779     def _real_extract(self, url):
 780         # Extract id from URL
 781         mobj = re.match(self._VALID_URL, url)
 782         if mobj is None:
 783             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 784             return
 785
 786         video_id = mobj.group(1)
 787
 788         video_extension = 'flv'
 789
 790         # Retrieve video webpage to extract further information
 791         request = compat_urllib_request.Request(url)
 792         try:
 793             self.report_download_webpage(video_id)
 794             webpage = compat_urllib_request.urlopen(request).read()
 795         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 796             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 797             return
 798
 799         # Extract URL, uploader, and title from webpage
 800         self.report_extraction(video_id)
 801         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 802         if mobj is None:
 803             self._downloader.trouble(u'ERROR: unable to extract media URL')
 804             return
 805         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 806
 807         video_url = mediaURL
 808
 809         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 810         if mobj is None:
 811             self._downloader.trouble(u'ERROR: unable to extract title')
 812             return
 813         video_title = mobj.group(1).decode('utf-8')
 814
 815         video_uploader = mobj.group(2).decode('utf-8')
 816
 817         return [{
 818             'id':       video_id.decode('utf-8'),
 819             'url':      video_url.decode('utf-8'),
 820             'uploader': video_uploader,
 821             'upload_date':  None,
 822             'title':    video_title,
 823             'ext':      video_extension.decode('utf-8'),
 824         }]
 825
 826
 827 class YahooIE(InfoExtractor):
 828     """Information extractor for video.yahoo.com."""
 829
 830     _WORKING = False
 831     # _VALID_URL matches all Yahoo! Video URLs
 832     # _VPAGE_URL matches only the extractable '/watch/' URLs
 833     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 834     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 835     IE_NAME = u'video.yahoo'
 836
 837     def __init__(self, downloader=None):
 838         InfoExtractor.__init__(self, downloader)
 839
 840     def report_download_webpage(self, video_id):
 841         """Report webpage download."""
 842         self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 843
 844     def report_extraction(self, video_id):
 845         """Report information extraction."""
 846         self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 847
 848     def _real_extract(self, url, new_video=True):
 849         # Extract ID from URL
 850         mobj = re.match(self._VALID_URL, url)
 851         if mobj is None:
 852             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 853             return
 854
 855         video_id = mobj.group(2)
 856         video_extension = 'flv'
 857
 858         # Rewrite valid but non-extractable URLs as
 859         # extractable English language /watch/ URLs
 860         if re.match(self._VPAGE_URL, url) is None:
 861             request = compat_urllib_request.Request(url)
 862             try:
 863                 webpage = compat_urllib_request.urlopen(request).read()
 864             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 865                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 866                 return
 867
 868             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 869             if mobj is None:
 870                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 871                 return
 872             yahoo_id = mobj.group(1)
 873
 874             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 875             if mobj is None:
 876                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 877                 return
 878             yahoo_vid = mobj.group(1)
 879
 880             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 881             return self._real_extract(url, new_video=False)
 882
 883         # Retrieve video webpage to extract further information
 884         request = compat_urllib_request.Request(url)
 885         try:
 886             self.report_download_webpage(video_id)
 887             webpage = compat_urllib_request.urlopen(request).read()
 888         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 889             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 890             return
 891
 892         # Extract uploader and title from webpage
 893         self.report_extraction(video_id)
 894         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 895         if mobj is None:
 896             self._downloader.trouble(u'ERROR: unable to extract video title')
 897             return
 898         video_title = mobj.group(1).decode('utf-8')
 899
 900         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 901         if mobj is None:
 902             self._downloader.trouble(u'ERROR: unable to extract video uploader')
 903             return
 904         video_uploader = mobj.group(1).decode('utf-8')
 905
 906         # Extract video thumbnail
 907         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 908         if mobj is None:
 909             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 910             return
 911         video_thumbnail = mobj.group(1).decode('utf-8')
 912
 913         # Extract video description
 914         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 915         if mobj is None:
 916             self._downloader.trouble(u'ERROR: unable to extract video description')
 917             return
 918         video_description = mobj.group(1).decode('utf-8')
 919         if not video_description:
 920             video_description = 'No description available.'
 921
 922         # Extract video height and width
 923         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 924         if mobj is None:
 925             self._downloader.trouble(u'ERROR: unable to extract video height')
 926             return
 927         yv_video_height = mobj.group(1)
 928
 929         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 930         if mobj is None:
 931             self._downloader.trouble(u'ERROR: unable to extract video width')
 932             return
 933         yv_video_width = mobj.group(1)
 934
 935         # Retrieve video playlist to extract media URL
 936         # I'm not completely sure what all these options are, but we
 937         # seem to need most of them, otherwise the server sends a 401.
 938         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 939         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 940         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 941                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 942                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 943         try:
 944             self.report_download_webpage(video_id)
 945             webpage = compat_urllib_request.urlopen(request).read()
 946         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 947             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 948             return
 949
 950         # Extract media URL from playlist XML
 951         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
 952         if mobj is None:
 953             self._downloader.trouble(u'ERROR: Unable to extract media URL')
 954             return
 955         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
 956         video_url = unescapeHTML(video_url)
 957
 958         return [{
 959             'id':       video_id.decode('utf-8'),
 960             'url':      video_url,
 961             'uploader': video_uploader,
 962             'upload_date':  None,
 963             'title':    video_title,
 964             'ext':      video_extension.decode('utf-8'),
 965             'thumbnail':    video_thumbnail.decode('utf-8'),
 966             'description':  video_description,
 967         }]
 968
 969
 970 class VimeoIE(InfoExtractor):
 971     """Information extractor for vimeo.com."""
 972
 973     # _VALID_URL matches Vimeo URLs
 974     _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
 975     IE_NAME = u'vimeo'
 976
 977     def __init__(self, downloader=None):
 978         InfoExtractor.__init__(self, downloader)
 979
 980     def report_download_webpage(self, video_id):
 981         """Report webpage download."""
 982         self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
 983
 984     def report_extraction(self, video_id):
 985         """Report information extraction."""
 986         self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
 987
 988     def _real_extract(self, url, new_video=True):
 989         # Extract ID from URL
 990         mobj = re.match(self._VALID_URL, url)
 991         if mobj is None:
 992             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 993             return
 994
 995         video_id = mobj.group(1)
 996
 997         # Retrieve video webpage to extract further information
 998         request = compat_urllib_request.Request(url, None, std_headers)
 999         try:
1000             self.report_download_webpage(video_id)
1001             webpage_bytes = compat_urllib_request.urlopen(request).read()
1002             webpage = webpage_bytes.decode('utf-8')
1003         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1004             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1005             return
1006
1007         # Now we begin extracting as much information as we can from what we
1008         # retrieved. First we extract the information common to all extractors,
1009         # and latter we extract those that are Vimeo specific.
1010         self.report_extraction(video_id)
1011
1012         # Extract the config JSON
1013         try:
1014             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1015             config = json.loads(config)
1016         except:
1017             self._downloader.trouble(u'ERROR: unable to extract info section')
1018             return
1019
1020         # Extract title
1021         video_title = config["video"]["title"]
1022
1023         # Extract uploader and uploader_id
1024         video_uploader = config["video"]["owner"]["name"]
1025         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1026
1027         # Extract video thumbnail
1028         video_thumbnail = config["video"]["thumbnail"]
1029
1030         # Extract video description
1031         video_description = get_element_by_attribute("itemprop", "description", webpage)
1032         if video_description: video_description = clean_html(video_description)
1033         else: video_description = ''
1034
1035         # Extract upload date
1036         video_upload_date = None
1037         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1038         if mobj is not None:
1039             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1040
1041         # Vimeo specific: extract request signature and timestamp
1042         sig = config['request']['signature']
1043         timestamp = config['request']['timestamp']
1044
1045         # Vimeo specific: extract video codec and quality information
1046         # First consider quality, then codecs, then take everything
1047         # TODO bind to format param
1048         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1049         files = { 'hd': [], 'sd': [], 'other': []}
1050         for codec_name, codec_extension in codecs:
1051             if codec_name in config["video"]["files"]:
1052                 if 'hd' in config["video"]["files"][codec_name]:
1053                     files['hd'].append((codec_name, codec_extension, 'hd'))
1054                 elif 'sd' in config["video"]["files"][codec_name]:
1055                     files['sd'].append((codec_name, codec_extension, 'sd'))
1056                 else:
1057                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1058
1059         for quality in ('hd', 'sd', 'other'):
1060             if len(files[quality]) > 0:
1061                 video_quality = files[quality][0][2]
1062                 video_codec = files[quality][0][0]
1063                 video_extension = files[quality][0][1]
1064                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1065                 break
1066         else:
1067             self._downloader.trouble(u'ERROR: no known codec found')
1068             return
1069
1070         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1071                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1072
1073         return [{
1074             'id':       video_id,
1075             'url':      video_url,
1076             'uploader': video_uploader,
1077             'uploader_id': video_uploader_id,
1078             'upload_date':  video_upload_date,
1079             'title':    video_title,
1080             'ext':      video_extension,
1081             'thumbnail':    video_thumbnail,
1082             'description':  video_description,
1083         }]
1084
1085
1086 class ArteTvIE(InfoExtractor):
1087     """arte.tv information extractor."""
1088
1089     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1090     _LIVE_URL = r'index-[0-9]+\.html$'
1091
1092     IE_NAME = u'arte.tv'
1093
1094     def __init__(self, downloader=None):
1095         InfoExtractor.__init__(self, downloader)
1096
1097     def report_download_webpage(self, video_id):
1098         """Report webpage download."""
1099         self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1100
1101     def report_extraction(self, video_id):
1102         """Report information extraction."""
1103         self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1104
1105     def fetch_webpage(self, url):
1106         request = compat_urllib_request.Request(url)
1107         try:
1108             self.report_download_webpage(url)
1109             webpage = compat_urllib_request.urlopen(request).read()
1110         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1111             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1112             return
1113         except ValueError as err:
1114             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1115             return
1116         return webpage
1117
1118     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1119         page = self.fetch_webpage(url)
1120         mobj = re.search(regex, page, regexFlags)
1121         info = {}
1122
1123         if mobj is None:
1124             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1125             return
1126
1127         for (i, key, err) in matchTuples:
1128             if mobj.group(i) is None:
1129                 self._downloader.trouble(err)
1130                 return
1131             else:
1132                 info[key] = mobj.group(i)
1133
1134         return info
1135
1136     def extractLiveStream(self, url):
1137         video_lang = url.split('/')[-4]
1138         info = self.grep_webpage(
1139             url,
1140             r'src="(.*?/videothek_js.*?\.js)',
1141             0,
1142             [
1143                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1144             ]
1145         )
1146         http_host = url.split('/')[2]
1147         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1148         info = self.grep_webpage(
1149             next_url,
1150             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1151                 '(http://.*?\.swf).*?' +
1152                 '(rtmp://.*?)\'',
1153             re.DOTALL,
1154             [
1155                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1156                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1157                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1158             ]
1159         )
1160         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1161
1162     def extractPlus7Stream(self, url):
1163         video_lang = url.split('/')[-3]
1164         info = self.grep_webpage(
1165             url,
1166             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1167             0,
1168             [
1169                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1170             ]
1171         )
1172         next_url = compat_urllib_parse.unquote(info.get('url'))
1173         info = self.grep_webpage(
1174             next_url,
1175             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1176             0,
1177             [
1178                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1179             ]
1180         )
1181         next_url = compat_urllib_parse.unquote(info.get('url'))
1182
1183         info = self.grep_webpage(
1184             next_url,
1185             r'<video id="(.*?)".*?>.*?' +
1186                 '<name>(.*?)</name>.*?' +
1187                 '<dateVideo>(.*?)</dateVideo>.*?' +
1188                 '<url quality="hd">(.*?)</url>',
1189             re.DOTALL,
1190             [
1191                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1192                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1193                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1194                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1195             ]
1196         )
1197
1198         return {
1199             'id':           info.get('id'),
1200             'url':          compat_urllib_parse.unquote(info.get('url')),
1201             'uploader':     u'arte.tv',
1202             'upload_date':  info.get('date'),
1203             'title':        info.get('title').decode('utf-8'),
1204             'ext':          u'mp4',
1205             'format':       u'NA',
1206             'player_url':   None,
1207         }
1208
1209     def _real_extract(self, url):
1210         video_id = url.split('/')[-1]
1211         self.report_extraction(video_id)
1212
1213         if re.search(self._LIVE_URL, video_id) is not None:
1214             self.extractLiveStream(url)
1215             return
1216         else:
1217             info = self.extractPlus7Stream(url)
1218
1219         return [info]
1220
1221
1222 class GenericIE(InfoExtractor):
1223     """Generic last-resort information extractor."""
1224
1225     _VALID_URL = r'.*'
1226     IE_NAME = u'generic'
1227
1228     def __init__(self, downloader=None):
1229         InfoExtractor.__init__(self, downloader)
1230
1231     def report_download_webpage(self, video_id):
1232         """Report webpage download."""
1233         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1234         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1235
1236     def report_extraction(self, video_id):
1237         """Report information extraction."""
1238         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1239
1240     def report_following_redirect(self, new_url):
1241         """Report information extraction."""
1242         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1243
1244     def _test_redirect(self, url):
1245         """Check if it is a redirect, like url shorteners, in case restart chain."""
1246         class HeadRequest(compat_urllib_request.Request):
1247             def get_method(self):
1248                 return "HEAD"
1249
1250         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1251             """
1252             Subclass the HTTPRedirectHandler to make it use our
1253             HeadRequest also on the redirected URL
1254             """
1255             def redirect_request(self, req, fp, code, msg, headers, newurl):
1256                 if code in (301, 302, 303, 307):
1257                     newurl = newurl.replace(' ', '%20')
1258                     newheaders = dict((k,v) for k,v in req.headers.items()
1259                                       if k.lower() not in ("content-length", "content-type"))
1260                     return HeadRequest(newurl,
1261                                        headers=newheaders,
1262                                        origin_req_host=req.get_origin_req_host(),
1263                                        unverifiable=True)
1264                 else:
1265                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1266
1267         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1268             """
1269             Fallback to GET if HEAD is not allowed (405 HTTP error)
1270             """
1271             def http_error_405(self, req, fp, code, msg, headers):
1272                 fp.read()
1273                 fp.close()
1274
1275                 newheaders = dict((k,v) for k,v in req.headers.items()
1276                                   if k.lower() not in ("content-length", "content-type"))
1277                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1278                                                  headers=newheaders,
1279                                                  origin_req_host=req.get_origin_req_host(),
1280                                                  unverifiable=True))
1281
1282         # Build our opener
1283         opener = compat_urllib_request.OpenerDirector()
1284         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1285                         HTTPMethodFallback, HEADRedirectHandler,
1286                         compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1287             opener.add_handler(handler())
1288
1289         response = opener.open(HeadRequest(url))
1290         new_url = response.geturl()
1291
1292         if url == new_url:
1293             return False
1294
1295         self.report_following_redirect(new_url)
1296         self._downloader.download([new_url])
1297         return True
1298
1299     def _real_extract(self, url):
1300         if self._test_redirect(url): return
1301
1302         video_id = url.split('/')[-1]
1303         request = compat_urllib_request.Request(url)
1304         try:
1305             self.report_download_webpage(video_id)
1306             webpage = compat_urllib_request.urlopen(request).read()
1307         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1308             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1309             return
1310         except ValueError as err:
1311             # since this is the last-resort InfoExtractor, if
1312             # this error is thrown, it'll be thrown here
1313             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1314             return
1315
1316         self.report_extraction(video_id)
1317         # Start with something easy: JW Player in SWFObject
1318         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1319         if mobj is None:
1320             # Broaden the search a little bit
1321             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1322         if mobj is None:
1323             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1324             return
1325
1326         # It's possible that one of the regexes
1327         # matched, but returned an empty group:
1328         if mobj.group(1) is None:
1329             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1330             return
1331
1332         video_url = compat_urllib_parse.unquote(mobj.group(1))
1333         video_id = os.path.basename(video_url)
1334
1335         # here's a fun little line of code for you:
1336         video_extension = os.path.splitext(video_id)[1][1:]
1337         video_id = os.path.splitext(video_id)[0]
1338
1339         # it's tempting to parse this further, but you would
1340         # have to take into account all the variations like
1341         #   Video Title - Site Name
1342         #   Site Name | Video Title
1343         #   Video Title - Tagline | Site Name
1344         # and so on and so forth; it's just not practical
1345         mobj = re.search(r'<title>(.*)</title>', webpage)
1346         if mobj is None:
1347             self._downloader.trouble(u'ERROR: unable to extract title')
1348             return
1349         video_title = mobj.group(1)
1350
1351         # video uploader is domain name
1352         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1353         if mobj is None:
1354             self._downloader.trouble(u'ERROR: unable to extract title')
1355             return
1356         video_uploader = mobj.group(1)
1357
1358         return [{
1359             'id':       video_id,
1360             'url':      video_url,
1361             'uploader': video_uploader,
1362             'upload_date':  None,
1363             'title':    video_title,
1364             'ext':      video_extension,
1365         }]
1366
1367
1368 class YoutubeSearchIE(InfoExtractor):
1369     """Information Extractor for YouTube search queries."""
1370     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1371     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1372     _max_youtube_results = 1000
1373     IE_NAME = u'youtube:search'
1374
1375     def __init__(self, downloader=None):
1376         InfoExtractor.__init__(self, downloader)
1377
1378     def report_download_page(self, query, pagenum):
1379         """Report attempt to download search page with given number."""
1380         query = query.decode(preferredencoding())
1381         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1382
1383     def _real_extract(self, query):
1384         mobj = re.match(self._VALID_URL, query)
1385         if mobj is None:
1386             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1387             return
1388
1389         prefix, query = query.split(':')
1390         prefix = prefix[8:]
1391         query = query.encode('utf-8')
1392         if prefix == '':
1393             self._download_n_results(query, 1)
1394             return
1395         elif prefix == 'all':
1396             self._download_n_results(query, self._max_youtube_results)
1397             return
1398         else:
1399             try:
1400                 n = int(prefix)
1401                 if n <= 0:
1402                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1403                     return
1404                 elif n > self._max_youtube_results:
1405                     self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1406                     n = self._max_youtube_results
1407                 self._download_n_results(query, n)
1408                 return
1409             except ValueError: # parsing prefix as integer fails
1410                 self._download_n_results(query, 1)
1411                 return
1412
1413     def _download_n_results(self, query, n):
1414         """Downloads a specified number of results for a query"""
1415
1416         video_ids = []
1417         pagenum = 0
1418         limit = n
1419
1420         while (50 * pagenum) < limit:
1421             self.report_download_page(query, pagenum+1)
1422             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1423             request = compat_urllib_request.Request(result_url)
1424             try:
1425                 data = compat_urllib_request.urlopen(request).read()
1426             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1427                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1428                 return
1429             api_response = json.loads(data)['data']
1430
1431             new_ids = list(video['id'] for video in api_response['items'])
1432             video_ids += new_ids
1433
1434             limit = min(n, api_response['totalItems'])
1435             pagenum += 1
1436
1437         if len(video_ids) > n:
1438             video_ids = video_ids[:n]
1439         for id in video_ids:
1440             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1441         return
1442
1443
1444 class GoogleSearchIE(InfoExtractor):
1445     """Information Extractor for Google Video search queries."""
1446     _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1447     _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1448     _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1449     _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1450     _max_google_results = 1000
1451     IE_NAME = u'video.google:search'
1452
1453     def __init__(self, downloader=None):
1454         InfoExtractor.__init__(self, downloader)
1455
1456     def report_download_page(self, query, pagenum):
1457         """Report attempt to download playlist page with given number."""
1458         query = query.decode(preferredencoding())
1459         self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1460
1461     def _real_extract(self, query):
1462         mobj = re.match(self._VALID_URL, query)
1463         if mobj is None:
1464             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1465             return
1466
1467         prefix, query = query.split(':')
1468         prefix = prefix[8:]
1469         query = query.encode('utf-8')
1470         if prefix == '':
1471             self._download_n_results(query, 1)
1472             return
1473         elif prefix == 'all':
1474             self._download_n_results(query, self._max_google_results)
1475             return
1476         else:
1477             try:
1478                 n = int(prefix)
1479                 if n <= 0:
1480                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1481                     return
1482                 elif n > self._max_google_results:
1483                     self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1484                     n = self._max_google_results
1485                 self._download_n_results(query, n)
1486                 return
1487             except ValueError: # parsing prefix as integer fails
1488                 self._download_n_results(query, 1)
1489                 return
1490
1491     def _download_n_results(self, query, n):
1492         """Downloads a specified number of results for a query"""
1493
1494         video_ids = []
1495         pagenum = 0
1496
1497         while True:
1498             self.report_download_page(query, pagenum)
1499             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1500             request = compat_urllib_request.Request(result_url)
1501             try:
1502                 page = compat_urllib_request.urlopen(request).read()
1503             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1504                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1505                 return
1506
1507             # Extract video identifiers
1508             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1509                 video_id = mobj.group(1)
1510                 if video_id not in video_ids:
1511                     video_ids.append(video_id)
1512                     if len(video_ids) == n:
1513                         # Specified n videos reached
1514                         for id in video_ids:
1515                             self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1516                         return
1517
1518             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1519                 for id in video_ids:
1520                     self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1521                 return
1522
1523             pagenum = pagenum + 1
1524
1525
1526 class YahooSearchIE(InfoExtractor):
1527     """Information Extractor for Yahoo! Video search queries."""
1528
1529     _WORKING = False
1530     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1531     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1532     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1533     _MORE_PAGES_INDICATOR = r'\s*Next'
1534     _max_yahoo_results = 1000
1535     IE_NAME = u'video.yahoo:search'
1536
1537     def __init__(self, downloader=None):
1538         InfoExtractor.__init__(self, downloader)
1539
1540     def report_download_page(self, query, pagenum):
1541         """Report attempt to download playlist page with given number."""
1542         query = query.decode(preferredencoding())
1543         self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1544
1545     def _real_extract(self, query):
1546         mobj = re.match(self._VALID_URL, query)
1547         if mobj is None:
1548             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1549             return
1550
1551         prefix, query = query.split(':')
1552         prefix = prefix[8:]
1553         query = query.encode('utf-8')
1554         if prefix == '':
1555             self._download_n_results(query, 1)
1556             return
1557         elif prefix == 'all':
1558             self._download_n_results(query, self._max_yahoo_results)
1559             return
1560         else:
1561             try:
1562                 n = int(prefix)
1563                 if n <= 0:
1564                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1565                     return
1566                 elif n > self._max_yahoo_results:
1567                     self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1568                     n = self._max_yahoo_results
1569                 self._download_n_results(query, n)
1570                 return
1571             except ValueError: # parsing prefix as integer fails
1572                 self._download_n_results(query, 1)
1573                 return
1574
1575     def _download_n_results(self, query, n):
1576         """Downloads a specified number of results for a query"""
1577
1578         video_ids = []
1579         already_seen = set()
1580         pagenum = 1
1581
1582         while True:
1583             self.report_download_page(query, pagenum)
1584             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1585             request = compat_urllib_request.Request(result_url)
1586             try:
1587                 page = compat_urllib_request.urlopen(request).read()
1588             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1589                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1590                 return
1591
1592             # Extract video identifiers
1593             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1594                 video_id = mobj.group(1)
1595                 if video_id not in already_seen:
1596                     video_ids.append(video_id)
1597                     already_seen.add(video_id)
1598                     if len(video_ids) == n:
1599                         # Specified n videos reached
1600                         for id in video_ids:
1601                             self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1602                         return
1603
1604             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1605                 for id in video_ids:
1606                     self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1607                 return
1608
1609             pagenum = pagenum + 1
1610
1611
1612 class YoutubePlaylistIE(InfoExtractor):
1613     """Information Extractor for YouTube playlists."""
1614
1615     _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1616     _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1617     _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1618     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1619     IE_NAME = u'youtube:playlist'
1620
1621     def __init__(self, downloader=None):
1622         InfoExtractor.__init__(self, downloader)
1623
1624     def report_download_page(self, playlist_id, pagenum):
1625         """Report attempt to download playlist page with given number."""
1626         self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1627
1628     def _real_extract(self, url):
1629         # Extract playlist id
1630         mobj = re.match(self._VALID_URL, url)
1631         if mobj is None:
1632             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1633             return
1634
1635         # Single video case
1636         if mobj.group(3) is not None:
1637             self._downloader.download([mobj.group(3)])
1638             return
1639
1640         # Download playlist pages
1641         # prefix is 'p' as default for playlists but there are other types that need extra care
1642         playlist_prefix = mobj.group(1)
1643         if playlist_prefix == 'a':
1644             playlist_access = 'artist'
1645         else:
1646             playlist_prefix = 'p'
1647             playlist_access = 'view_play_list'
1648         playlist_id = mobj.group(2)
1649         video_ids = []
1650         pagenum = 1
1651
1652         while True:
1653             self.report_download_page(playlist_id, pagenum)
1654             url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1655             request = compat_urllib_request.Request(url)
1656             try:
1657                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1658             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1659                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1660                 return
1661
1662             # Extract video identifiers
1663             ids_in_page = []
1664             for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1665                 if mobj.group(1) not in ids_in_page:
1666                     ids_in_page.append(mobj.group(1))
1667             video_ids.extend(ids_in_page)
1668
1669             if self._MORE_PAGES_INDICATOR not in page:
1670                 break
1671             pagenum = pagenum + 1
1672
1673         total = len(video_ids)
1674
1675         playliststart = self._downloader.params.get('playliststart', 1) - 1
1676         playlistend = self._downloader.params.get('playlistend', -1)
1677         if playlistend == -1:
1678             video_ids = video_ids[playliststart:]
1679         else:
1680             video_ids = video_ids[playliststart:playlistend]
1681
1682         if len(video_ids) == total:
1683             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1684         else:
1685             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1686
1687         for id in video_ids:
1688             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1689         return
1690
1691
1692 class YoutubeChannelIE(InfoExtractor):
1693     """Information Extractor for YouTube channels."""
1694
1695     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1696     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1697     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1698     IE_NAME = u'youtube:channel'
1699
1700     def report_download_page(self, channel_id, pagenum):
1701         """Report attempt to download channel page with given number."""
1702         self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1703
1704     def _real_extract(self, url):
1705         # Extract channel id
1706         mobj = re.match(self._VALID_URL, url)
1707         if mobj is None:
1708             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1709             return
1710
1711         # Download channel pages
1712         channel_id = mobj.group(1)
1713         video_ids = []
1714         pagenum = 1
1715
1716         while True:
1717             self.report_download_page(channel_id, pagenum)
1718             url = self._TEMPLATE_URL % (channel_id, pagenum)
1719             request = compat_urllib_request.Request(url)
1720             try:
1721                 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1722             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1723                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1724                 return
1725
1726             # Extract video identifiers
1727             ids_in_page = []
1728             for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1729                 if mobj.group(1) not in ids_in_page:
1730                     ids_in_page.append(mobj.group(1))
1731             video_ids.extend(ids_in_page)
1732
1733             if self._MORE_PAGES_INDICATOR not in page:
1734                 break
1735             pagenum = pagenum + 1
1736
1737         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1738
1739         for id in video_ids:
1740             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1741         return
1742
1743
1744 class YoutubeUserIE(InfoExtractor):
1745     """Information Extractor for YouTube users."""
1746
1747     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1748     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1749     _GDATA_PAGE_SIZE = 50
1750     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1751     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1752     IE_NAME = u'youtube:user'
1753
1754     def __init__(self, downloader=None):
1755         InfoExtractor.__init__(self, downloader)
1756
1757     def report_download_page(self, username, start_index):
1758         """Report attempt to download user page."""
1759         self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1760                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1761
1762     def _real_extract(self, url):
1763         # Extract username
1764         mobj = re.match(self._VALID_URL, url)
1765         if mobj is None:
1766             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1767             return
1768
1769         username = mobj.group(1)
1770
1771         # Download video ids using YouTube Data API. Result size per
1772         # query is limited (currently to 50 videos) so we need to query
1773         # page by page until there are no video ids - it means we got
1774         # all of them.
1775
1776         video_ids = []
1777         pagenum = 0
1778
1779         while True:
1780             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1781             self.report_download_page(username, start_index)
1782
1783             request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1784
1785             try:
1786                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1787             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1788                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1789                 return
1790
1791             # Extract video identifiers
1792             ids_in_page = []
1793
1794             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1795                 if mobj.group(1) not in ids_in_page:
1796                     ids_in_page.append(mobj.group(1))
1797
1798             video_ids.extend(ids_in_page)
1799
1800             # A little optimization - if current page is not
1801             # "full", ie. does not contain PAGE_SIZE video ids then
1802             # we can assume that this page is the last one - there
1803             # are no more ids on further pages - no need to query
1804             # again.
1805
1806             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1807                 break
1808
1809             pagenum += 1
1810
1811         all_ids_count = len(video_ids)
1812         playliststart = self._downloader.params.get('playliststart', 1) - 1
1813         playlistend = self._downloader.params.get('playlistend', -1)
1814
1815         if playlistend == -1:
1816             video_ids = video_ids[playliststart:]
1817         else:
1818             video_ids = video_ids[playliststart:playlistend]
1819
1820         self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1821                 (username, all_ids_count, len(video_ids)))
1822
1823         for video_id in video_ids:
1824             self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1825
1826
1827 class BlipTVUserIE(InfoExtractor):
1828     """Information Extractor for blip.tv users."""
1829
1830     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1831     _PAGE_SIZE = 12
1832     IE_NAME = u'blip.tv:user'
1833
1834     def __init__(self, downloader=None):
1835         InfoExtractor.__init__(self, downloader)
1836
1837     def report_download_page(self, username, pagenum):
1838         """Report attempt to download user page."""
1839         self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1840                 (self.IE_NAME, username, pagenum))
1841
1842     def _real_extract(self, url):
1843         # Extract username
1844         mobj = re.match(self._VALID_URL, url)
1845         if mobj is None:
1846             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1847             return
1848
1849         username = mobj.group(1)
1850
1851         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1852
1853         request = compat_urllib_request.Request(url)
1854
1855         try:
1856             page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1857             mobj = re.search(r'data-users-id="([^"]+)"', page)
1858             page_base = page_base % mobj.group(1)
1859         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1860             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1861             return
1862
1863
1864         # Download video ids using BlipTV Ajax calls. Result size per
1865         # query is limited (currently to 12 videos) so we need to query
1866         # page by page until there are no video ids - it means we got
1867         # all of them.
1868
1869         video_ids = []
1870         pagenum = 1
1871
1872         while True:
1873             self.report_download_page(username, pagenum)
1874
1875             request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1876
1877             try:
1878                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1879             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1880                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1881                 return
1882
1883             # Extract video identifiers
1884             ids_in_page = []
1885
1886             for mobj in re.finditer(r'href="/([^"]+)"', page):
1887                 if mobj.group(1) not in ids_in_page:
1888                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1889
1890             video_ids.extend(ids_in_page)
1891
1892             # A little optimization - if current page is not
1893             # "full", ie. does not contain PAGE_SIZE video ids then
1894             # we can assume that this page is the last one - there
1895             # are no more ids on further pages - no need to query
1896             # again.
1897
1898             if len(ids_in_page) < self._PAGE_SIZE:
1899                 break
1900
1901             pagenum += 1
1902
1903         all_ids_count = len(video_ids)
1904         playliststart = self._downloader.params.get('playliststart', 1) - 1
1905         playlistend = self._downloader.params.get('playlistend', -1)
1906
1907         if playlistend == -1:
1908             video_ids = video_ids[playliststart:]
1909         else:
1910             video_ids = video_ids[playliststart:playlistend]
1911
1912         self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1913                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1914
1915         for video_id in video_ids:
1916             self._downloader.download([u'http://blip.tv/'+video_id])
1917
1918
1919 class DepositFilesIE(InfoExtractor):
1920     """Information extractor for depositfiles.com"""
1921
1922     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1923
1924     def report_download_webpage(self, file_id):
1925         """Report webpage download."""
1926         self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1927
1928     def report_extraction(self, file_id):
1929         """Report information extraction."""
1930         self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1931
1932     def _real_extract(self, url):
1933         file_id = url.split('/')[-1]
1934         # Rebuild url in english locale
1935         url = 'http://depositfiles.com/en/files/' + file_id
1936
1937         # Retrieve file webpage with 'Free download' button pressed
1938         free_download_indication = { 'gateway_result' : '1' }
1939         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1940         try:
1941             self.report_download_webpage(file_id)
1942             webpage = compat_urllib_request.urlopen(request).read()
1943         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1944             self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1945             return
1946
1947         # Search for the real file URL
1948         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1949         if (mobj is None) or (mobj.group(1) is None):
1950             # Try to figure out reason of the error.
1951             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1952             if (mobj is not None) and (mobj.group(1) is not None):
1953                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1954                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1955             else:
1956                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1957             return
1958
1959         file_url = mobj.group(1)
1960         file_extension = os.path.splitext(file_url)[1][1:]
1961
1962         # Search for file title
1963         mobj = re.search(r'<b title="(.*?)">', webpage)
1964         if mobj is None:
1965             self._downloader.trouble(u'ERROR: unable to extract title')
1966             return
1967         file_title = mobj.group(1).decode('utf-8')
1968
1969         return [{
1970             'id':       file_id.decode('utf-8'),
1971             'url':      file_url.decode('utf-8'),
1972             'uploader': None,
1973             'upload_date':  None,
1974             'title':    file_title,
1975             'ext':      file_extension.decode('utf-8'),
1976         }]
1977
1978
1979 class FacebookIE(InfoExtractor):
1980     """Information Extractor for Facebook"""
1981
1982     _WORKING = False
1983     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1984     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1985     _NETRC_MACHINE = 'facebook'
1986     _available_formats = ['video', 'highqual', 'lowqual']
1987     _video_extensions = {
1988         'video': 'mp4',
1989         'highqual': 'mp4',
1990         'lowqual': 'mp4',
1991     }
1992     IE_NAME = u'facebook'
1993
1994     def __init__(self, downloader=None):
1995         InfoExtractor.__init__(self, downloader)
1996
1997     def _reporter(self, message):
1998         """Add header and report message."""
1999         self._downloader.to_screen(u'[facebook] %s' % message)
2000
2001     def report_login(self):
2002         """Report attempt to log in."""
2003         self._reporter(u'Logging in')
2004
2005     def report_video_webpage_download(self, video_id):
2006         """Report attempt to download video webpage."""
2007         self._reporter(u'%s: Downloading video webpage' % video_id)
2008
2009     def report_information_extraction(self, video_id):
2010         """Report attempt to extract video information."""
2011         self._reporter(u'%s: Extracting video information' % video_id)
2012
2013     def _parse_page(self, video_webpage):
2014         """Extract video information from page"""
2015         # General data
2016         data = {'title': r'\("video_title", "(.*?)"\)',
2017             'description': r'<div class="datawrap">(.*?)</div>',
2018             'owner': r'\("video_owner_name", "(.*?)"\)',
2019             'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2020             }
2021         video_info = {}
2022         for piece in data.keys():
2023             mobj = re.search(data[piece], video_webpage)
2024             if mobj is not None:
2025                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2026
2027         # Video urls
2028         video_urls = {}
2029         for fmt in self._available_formats:
2030             mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2031             if mobj is not None:
2032                 # URL is in a Javascript segment inside an escaped Unicode format within
2033                 # the generally utf-8 page
2034                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2035         video_info['video_urls'] = video_urls
2036
2037         return video_info
2038
2039     def _real_initialize(self):
2040         if self._downloader is None:
2041             return
2042
2043         useremail = None
2044         password = None
2045         downloader_params = self._downloader.params
2046
2047         # Attempt to use provided username and password or .netrc data
2048         if downloader_params.get('username', None) is not None:
2049             useremail = downloader_params['username']
2050             password = downloader_params['password']
2051         elif downloader_params.get('usenetrc', False):
2052             try:
2053                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2054                 if info is not None:
2055                     useremail = info[0]
2056                     password = info[2]
2057                 else:
2058                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2059             except (IOError, netrc.NetrcParseError) as err:
2060                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2061                 return
2062
2063         if useremail is None:
2064             return
2065
2066         # Log in
2067         login_form = {
2068             'email': useremail,
2069             'pass': password,
2070             'login': 'Log+In'
2071             }
2072         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2073         try:
2074             self.report_login()
2075             login_results = compat_urllib_request.urlopen(request).read()
2076             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2077                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2078                 return
2079         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2080             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2081             return
2082
2083     def _real_extract(self, url):
2084         mobj = re.match(self._VALID_URL, url)
2085         if mobj is None:
2086             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2087             return
2088         video_id = mobj.group('ID')
2089
2090         # Get video webpage
2091         self.report_video_webpage_download(video_id)
2092         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2093         try:
2094             page = compat_urllib_request.urlopen(request)
2095             video_webpage = page.read()
2096         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2097             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2098             return
2099
2100         # Start extracting information
2101         self.report_information_extraction(video_id)
2102
2103         # Extract information
2104         video_info = self._parse_page(video_webpage)
2105
2106         # uploader
2107         if 'owner' not in video_info:
2108             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2109             return
2110         video_uploader = video_info['owner']
2111
2112         # title
2113         if 'title' not in video_info:
2114             self._downloader.trouble(u'ERROR: unable to extract video title')
2115             return
2116         video_title = video_info['title']
2117         video_title = video_title.decode('utf-8')
2118
2119         # thumbnail image
2120         if 'thumbnail' not in video_info:
2121             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2122             video_thumbnail = ''
2123         else:
2124             video_thumbnail = video_info['thumbnail']
2125
2126         # upload date
2127         upload_date = None
2128         if 'upload_date' in video_info:
2129             upload_time = video_info['upload_date']
2130             timetuple = email.utils.parsedate_tz(upload_time)
2131             if timetuple is not None:
2132                 try:
2133                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2134                 except:
2135                     pass
2136
2137         # description
2138         video_description = video_info.get('description', 'No description available.')
2139
2140         url_map = video_info['video_urls']
2141         if url_map:
2142             # Decide which formats to download
2143             req_format = self._downloader.params.get('format', None)
2144             format_limit = self._downloader.params.get('format_limit', None)
2145
2146             if format_limit is not None and format_limit in self._available_formats:
2147                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2148             else:
2149                 format_list = self._available_formats
2150             existing_formats = [x for x in format_list if x in url_map]
2151             if len(existing_formats) == 0:
2152                 self._downloader.trouble(u'ERROR: no known formats available for video')
2153                 return
2154             if req_format is None:
2155                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2156             elif req_format == 'worst':
2157                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2158             elif req_format == '-1':
2159                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2160             else:
2161                 # Specific format
2162                 if req_format not in url_map:
2163                     self._downloader.trouble(u'ERROR: requested format not available')
2164                     return
2165                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2166
2167         results = []
2168         for format_param, video_real_url in video_url_list:
2169             # Extension
2170             video_extension = self._video_extensions.get(format_param, 'mp4')
2171
2172             results.append({
2173                 'id':       video_id.decode('utf-8'),
2174                 'url':      video_real_url.decode('utf-8'),
2175                 'uploader': video_uploader.decode('utf-8'),
2176                 'upload_date':  upload_date,
2177                 'title':    video_title,
2178                 'ext':      video_extension.decode('utf-8'),
2179                 'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
2180                 'thumbnail':    video_thumbnail.decode('utf-8'),
2181                 'description':  video_description.decode('utf-8'),
2182             })
2183         return results
2184
2185 class BlipTVIE(InfoExtractor):
2186     """Information extractor for blip.tv"""
2187
2188     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2189     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2190     IE_NAME = u'blip.tv'
2191
2192     def report_extraction(self, file_id):
2193         """Report information extraction."""
2194         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2195
2196     def report_direct_download(self, title):
2197         """Report information extraction."""
2198         self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2199
2200     def _real_extract(self, url):
2201         mobj = re.match(self._VALID_URL, url)
2202         if mobj is None:
2203             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2204             return
2205
2206         if '?' in url:
2207             cchar = '&'
2208         else:
2209             cchar = '?'
2210         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2211         request = compat_urllib_request.Request(json_url)
2212         self.report_extraction(mobj.group(1))
2213         info = None
2214         try:
2215             urlh = compat_urllib_request.urlopen(request)
2216             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2217                 basename = url.split('/')[-1]
2218                 title,ext = os.path.splitext(basename)
2219                 title = title.decode('UTF-8')
2220                 ext = ext.replace('.', '')
2221                 self.report_direct_download(title)
2222                 info = {
2223                     'id': title,
2224                     'url': url,
2225                     'uploader': None,
2226                     'upload_date': None,
2227                     'title': title,
2228                     'ext': ext,
2229                     'urlhandle': urlh
2230                 }
2231         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2232             self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2233             return
2234         if info is None: # Regular URL
2235             try:
2236                 json_code_bytes = urlh.read()
2237                 json_code = json_code_bytes.decode('utf-8')
2238             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2239                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2240                 return
2241
2242             try:
2243                 json_data = json.loads(json_code)
2244                 if 'Post' in json_data:
2245                     data = json_data['Post']
2246                 else:
2247                     data = json_data
2248
2249                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2250                 video_url = data['media']['url']
2251                 umobj = re.match(self._URL_EXT, video_url)
2252                 if umobj is None:
2253                     raise ValueError('Can not determine filename extension')
2254                 ext = umobj.group(1)
2255
2256                 info = {
2257                     'id': data['item_id'],
2258                     'url': video_url,
2259                     'uploader': data['display_name'],
2260                     'upload_date': upload_date,
2261                     'title': data['title'],
2262                     'ext': ext,
2263                     'format': data['media']['mimeType'],
2264                     'thumbnail': data['thumbnailUrl'],
2265                     'description': data['description'],
2266                     'player_url': data['embedUrl']
2267                 }
2268             except (ValueError,KeyError) as err:
2269                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2270                 return
2271
2272         std_headers['User-Agent'] = 'iTunes/10.6.1'
2273         return [info]
2274
2275
2276 class MyVideoIE(InfoExtractor):
2277     """Information Extractor for myvideo.de."""
2278
2279     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2280     IE_NAME = u'myvideo'
2281
2282     def __init__(self, downloader=None):
2283         InfoExtractor.__init__(self, downloader)
2284
2285     def report_extraction(self, video_id):
2286         """Report information extraction."""
2287         self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2288
2289     def _real_extract(self,url):
2290         mobj = re.match(self._VALID_URL, url)
2291         if mobj is None:
2292             self._download.trouble(u'ERROR: invalid URL: %s' % url)
2293             return
2294
2295         video_id = mobj.group(1)
2296
2297         # Get video webpage
2298         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2299         webpage = self._download_webpage(webpage_url, video_id)
2300
2301         self.report_extraction(video_id)
2302         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2303                  webpage)
2304         if mobj is None:
2305             self._downloader.trouble(u'ERROR: unable to extract media URL')
2306             return
2307         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2308
2309         mobj = re.search('<title>([^<]+)</title>', webpage)
2310         if mobj is None:
2311             self._downloader.trouble(u'ERROR: unable to extract title')
2312             return
2313
2314         video_title = mobj.group(1)
2315
2316         return [{
2317             'id':       video_id,
2318             'url':      video_url,
2319             'uploader': None,
2320             'upload_date':  None,
2321             'title':    video_title,
2322             'ext':      u'flv',
2323         }]
2324
2325 class ComedyCentralIE(InfoExtractor):
2326     """Information extractor for The Daily Show and Colbert Report """
2327
2328     # urls can be abbreviations like :thedailyshow or :colbert
2329     # urls for episodes like:
2330     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2331     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2332     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2333     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2334                       |(https?://)?(www\.)?
2335                           (?P<showname>thedailyshow|colbertnation)\.com/
2336                          (full-episodes/(?P<episode>.*)|
2337                           (?P<clip>
2338                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2339                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2340                      $"""
2341
2342     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2343
2344     _video_extensions = {
2345         '3500': 'mp4',
2346         '2200': 'mp4',
2347         '1700': 'mp4',
2348         '1200': 'mp4',
2349         '750': 'mp4',
2350         '400': 'mp4',
2351     }
2352     _video_dimensions = {
2353         '3500': '1280x720',
2354         '2200': '960x540',
2355         '1700': '768x432',
2356         '1200': '640x360',
2357         '750': '512x288',
2358         '400': '384x216',
2359     }
2360
2361     def suitable(self, url):
2362         """Receives a URL and returns True if suitable for this IE."""
2363         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2364
2365     def report_extraction(self, episode_id):
2366         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2367
2368     def report_config_download(self, episode_id, media_id):
2369         self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2370
2371     def report_index_download(self, episode_id):
2372         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2373
2374     def _print_formats(self, formats):
2375         print('Available formats:')
2376         for x in formats:
2377             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2378
2379
2380     def _real_extract(self, url):
2381         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2382         if mobj is None:
2383             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2384             return
2385
2386         if mobj.group('shortname'):
2387             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2388                 url = u'http://www.thedailyshow.com/full-episodes/'
2389             else:
2390                 url = u'http://www.colbertnation.com/full-episodes/'
2391             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2392             assert mobj is not None
2393
2394         if mobj.group('clip'):
2395             if mobj.group('showname') == 'thedailyshow':
2396                 epTitle = mobj.group('tdstitle')
2397             else:
2398                 epTitle = mobj.group('cntitle')
2399             dlNewest = False
2400         else:
2401             dlNewest = not mobj.group('episode')
2402             if dlNewest:
2403                 epTitle = mobj.group('showname')
2404             else:
2405                 epTitle = mobj.group('episode')
2406
2407         req = compat_urllib_request.Request(url)
2408         self.report_extraction(epTitle)
2409         try:
2410             htmlHandle = compat_urllib_request.urlopen(req)
2411             html = htmlHandle.read()
2412             webpage = html.decode('utf-8')
2413         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2414             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2415             return
2416         if dlNewest:
2417             url = htmlHandle.geturl()
2418             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2419             if mobj is None:
2420                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2421                 return
2422             if mobj.group('episode') == '':
2423                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2424                 return
2425             epTitle = mobj.group('episode')
2426
2427         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2428
2429         if len(mMovieParams) == 0:
2430             # The Colbert Report embeds the information in a without
2431             # a URL prefix; so extract the alternate reference
2432             # and then add the URL prefix manually.
2433
2434             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2435             if len(altMovieParams) == 0:
2436                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2437                 return
2438             else:
2439                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2440
2441         uri = mMovieParams[0][1]
2442         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2443         self.report_index_download(epTitle)
2444         try:
2445             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2446         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2447             self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2448             return
2449
2450         results = []
2451
2452         idoc = xml.etree.ElementTree.fromstring(indexXml)
2453         itemEls = idoc.findall('.//item')
2454         for partNum,itemEl in enumerate(itemEls):
2455             mediaId = itemEl.findall('./guid')[0].text
2456             shortMediaId = mediaId.split(':')[-1]
2457             showId = mediaId.split(':')[-2].replace('.com', '')
2458             officialTitle = itemEl.findall('./title')[0].text
2459             officialDate = itemEl.findall('./pubDate')[0].text
2460
2461             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2462                         compat_urllib_parse.urlencode({'uri': mediaId}))
2463             configReq = compat_urllib_request.Request(configUrl)
2464             self.report_config_download(epTitle, shortMediaId)
2465             try:
2466                 configXml = compat_urllib_request.urlopen(configReq).read()
2467             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2468                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2469                 return
2470
2471             cdoc = xml.etree.ElementTree.fromstring(configXml)
2472             turls = []
2473             for rendition in cdoc.findall('.//rendition'):
2474                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2475                 turls.append(finfo)
2476
2477             if len(turls) == 0:
2478                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2479                 continue
2480
2481             if self._downloader.params.get('listformats', None):
2482                 self._print_formats([i[0] for i in turls])
2483                 return
2484
2485             # For now, just pick the highest bitrate
2486             format,rtmp_video_url = turls[-1]
2487
2488             # Get the format arg from the arg stream
2489             req_format = self._downloader.params.get('format', None)
2490
2491             # Select format if we can find one
2492             for f,v in turls:
2493                 if f == req_format:
2494                     format, rtmp_video_url = f, v
2495                     break
2496
2497             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2498             if not m:
2499                 raise ExtractorError(u'Cannot transform RTMP url')
2500             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2501             video_url = base + m.group('finalid')
2502
2503             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2504             info = {
2505                 'id': shortMediaId,
2506                 'url': video_url,
2507                 'uploader': showId,
2508                 'upload_date': officialDate,
2509                 'title': effTitle,
2510                 'ext': 'mp4',
2511                 'format': format,
2512                 'thumbnail': None,
2513                 'description': officialTitle,
2514             }
2515             results.append(info)
2516
2517         return results
2518
2519
2520 class EscapistIE(InfoExtractor):
2521     """Information extractor for The Escapist """
2522
2523     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2524     IE_NAME = u'escapist'
2525
2526     def report_extraction(self, showName):
2527         self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2528
2529     def report_config_download(self, showName):
2530         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2531
2532     def _real_extract(self, url):
2533         mobj = re.match(self._VALID_URL, url)
2534         if mobj is None:
2535             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2536             return
2537         showName = mobj.group('showname')
2538         videoId = mobj.group('episode')
2539
2540         self.report_extraction(showName)
2541         try:
2542             webPage = compat_urllib_request.urlopen(url)
2543             webPageBytes = webPage.read()
2544             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2545             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2546         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2547             self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2548             return
2549
2550         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2551         description = unescapeHTML(descMatch.group(1))
2552         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2553         imgUrl = unescapeHTML(imgMatch.group(1))
2554         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2555         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2556         configUrlMatch = re.search('config=(.*)$', playerUrl)
2557         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2558
2559         self.report_config_download(showName)
2560         try:
2561             configJSON = compat_urllib_request.urlopen(configUrl)
2562             m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2563             configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2564         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2565             self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2566             return
2567
2568         # Technically, it's JavaScript, not JSON
2569         configJSON = configJSON.replace("'", '"')
2570
2571         try:
2572             config = json.loads(configJSON)
2573         except (ValueError,) as err:
2574             self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2575             return
2576
2577         playlist = config['playlist']
2578         videoUrl = playlist[1]['url']
2579
2580         info = {
2581             'id': videoId,
2582             'url': videoUrl,
2583             'uploader': showName,
2584             'upload_date': None,
2585             'title': showName,
2586             'ext': 'flv',
2587             'thumbnail': imgUrl,
2588             'description': description,
2589             'player_url': playerUrl,
2590         }
2591
2592         return [info]
2593
2594 class CollegeHumorIE(InfoExtractor):
2595     """Information extractor for collegehumor.com"""
2596
2597     _WORKING = False
2598     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2599     IE_NAME = u'collegehumor'
2600
2601     def report_manifest(self, video_id):
2602         """Report information extraction."""
2603         self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2604
2605     def report_extraction(self, video_id):
2606         """Report information extraction."""
2607         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2608
2609     def _real_extract(self, url):
2610         mobj = re.match(self._VALID_URL, url)
2611         if mobj is None:
2612             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2613             return
2614         video_id = mobj.group('videoid')
2615
2616         info = {
2617             'id': video_id,
2618             'uploader': None,
2619             'upload_date': None,
2620         }
2621
2622         self.report_extraction(video_id)
2623         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2624         try:
2625             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2626         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2627             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2628             return
2629
2630         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2631         try:
2632             videoNode = mdoc.findall('./video')[0]
2633             info['description'] = videoNode.findall('./description')[0].text
2634             info['title'] = videoNode.findall('./caption')[0].text
2635             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2636             manifest_url = videoNode.findall('./file')[0].text
2637         except IndexError:
2638             self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2639             return
2640
2641         manifest_url += '?hdcore=2.10.3'
2642         self.report_manifest(video_id)
2643         try:
2644             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2645         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2646             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2647             return
2648
2649         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2650         try:
2651             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2652             node_id = media_node.attrib['url']
2653             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2654         except IndexError as err:
2655             self._downloader.trouble(u'\nERROR: Invalid manifest file')
2656             return
2657
2658         url_pr = compat_urllib_parse_urlparse(manifest_url)
2659         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2660
2661         info['url'] = url
2662         info['ext'] = 'f4f'
2663         return [info]
2664
2665
2666 class XVideosIE(InfoExtractor):
2667     """Information extractor for xvideos.com"""
2668
2669     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2670     IE_NAME = u'xvideos'
2671
2672     def report_extraction(self, video_id):
2673         """Report information extraction."""
2674         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2675
2676     def _real_extract(self, url):
2677         mobj = re.match(self._VALID_URL, url)
2678         if mobj is None:
2679             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2680             return
2681         video_id = mobj.group(1)
2682
2683         webpage = self._download_webpage(url, video_id)
2684
2685         self.report_extraction(video_id)
2686
2687
2688         # Extract video URL
2689         mobj = re.search(r'flv_url=(.+?)&', webpage)
2690         if mobj is None:
2691             self._downloader.trouble(u'ERROR: unable to extract video url')
2692             return
2693         video_url = compat_urllib_parse.unquote(mobj.group(1))
2694
2695
2696         # Extract title
2697         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2698         if mobj is None:
2699             self._downloader.trouble(u'ERROR: unable to extract video title')
2700             return
2701         video_title = mobj.group(1)
2702
2703
2704         # Extract video thumbnail
2705         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2706         if mobj is None:
2707             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2708             return
2709         video_thumbnail = mobj.group(0)
2710
2711         info = {
2712             'id': video_id,
2713             'url': video_url,
2714             'uploader': None,
2715             'upload_date': None,
2716             'title': video_title,
2717             'ext': 'flv',
2718             'thumbnail': video_thumbnail,
2719             'description': None,
2720         }
2721
2722         return [info]
2723
2724
2725 class SoundcloudIE(InfoExtractor):
2726     """Information extractor for soundcloud.com
2727        To access the media, the uid of the song and a stream token
2728        must be extracted from the page source and the script must make
2729        a request to media.soundcloud.com/crossdomain.xml. Then
2730        the media can be grabbed by requesting from an url composed
2731        of the stream token and uid
2732      """
2733
2734     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2735     IE_NAME = u'soundcloud'
2736
2737     def __init__(self, downloader=None):
2738         InfoExtractor.__init__(self, downloader)
2739
2740     def report_resolve(self, video_id):
2741         """Report information extraction."""
2742         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2743
2744     def report_extraction(self, video_id):
2745         """Report information extraction."""
2746         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2747
2748     def _real_extract(self, url):
2749         mobj = re.match(self._VALID_URL, url)
2750         if mobj is None:
2751             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2752             return
2753
2754         # extract uploader (which is in the url)
2755         uploader = mobj.group(1)
2756         # extract simple title (uploader + slug of song title)
2757         slug_title =  mobj.group(2)
2758         simple_title = uploader + u'-' + slug_title
2759
2760         self.report_resolve('%s/%s' % (uploader, slug_title))
2761
2762         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2763         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2764         request = compat_urllib_request.Request(resolv_url)
2765         try:
2766             info_json_bytes = compat_urllib_request.urlopen(request).read()
2767             info_json = info_json_bytes.decode('utf-8')
2768         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2769             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2770             return
2771
2772         info = json.loads(info_json)
2773         video_id = info['id']
2774         self.report_extraction('%s/%s' % (uploader, slug_title))
2775
2776         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2777         request = compat_urllib_request.Request(streams_url)
2778         try:
2779             stream_json_bytes = compat_urllib_request.urlopen(request).read()
2780             stream_json = stream_json_bytes.decode('utf-8')
2781         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2782             self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2783             return
2784
2785         streams = json.loads(stream_json)
2786         mediaURL = streams['http_mp3_128_url']
2787
2788         return [{
2789             'id':       info['id'],
2790             'url':      mediaURL,
2791             'uploader': info['user']['username'],
2792             'upload_date':  info['created_at'],
2793             'title':    info['title'],
2794             'ext':      u'mp3',
2795             'description': info['description'],
2796         }]
2797
2798
2799 class InfoQIE(InfoExtractor):
2800     """Information extractor for infoq.com"""
2801     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2802
2803     def report_extraction(self, video_id):
2804         """Report information extraction."""
2805         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2806
2807     def _real_extract(self, url):
2808         mobj = re.match(self._VALID_URL, url)
2809         if mobj is None:
2810             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2811             return
2812
2813         webpage = self._download_webpage(url, video_id=url)
2814         self.report_extraction(url)
2815
2816         # Extract video URL
2817         mobj = re.search(r"jsclassref='([^']*)'", webpage)
2818         if mobj is None:
2819             self._downloader.trouble(u'ERROR: unable to extract video url')
2820             return
2821         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2822         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2823
2824         # Extract title
2825         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2826         if mobj is None:
2827             self._downloader.trouble(u'ERROR: unable to extract video title')
2828             return
2829         video_title = mobj.group(1)
2830
2831         # Extract description
2832         video_description = u'No description available.'
2833         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2834         if mobj is not None:
2835             video_description = mobj.group(1)
2836
2837         video_filename = video_url.split('/')[-1]
2838         video_id, extension = video_filename.split('.')
2839
2840         info = {
2841             'id': video_id,
2842             'url': video_url,
2843             'uploader': None,
2844             'upload_date': None,
2845             'title': video_title,
2846             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2847             'thumbnail': None,
2848             'description': video_description,
2849         }
2850
2851         return [info]
2852
2853 class MixcloudIE(InfoExtractor):
2854     """Information extractor for www.mixcloud.com"""
2855
2856     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2857     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2858     IE_NAME = u'mixcloud'
2859
2860     def __init__(self, downloader=None):
2861         InfoExtractor.__init__(self, downloader)
2862
2863     def report_download_json(self, file_id):
2864         """Report JSON download."""
2865         self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2866
2867     def report_extraction(self, file_id):
2868         """Report information extraction."""
2869         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2870
2871     def get_urls(self, jsonData, fmt, bitrate='best'):
2872         """Get urls from 'audio_formats' section in json"""
2873         file_url = None
2874         try:
2875             bitrate_list = jsonData[fmt]
2876             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2877                 bitrate = max(bitrate_list) # select highest
2878
2879             url_list = jsonData[fmt][bitrate]
2880         except TypeError: # we have no bitrate info.
2881             url_list = jsonData[fmt]
2882         return url_list
2883
2884     def check_urls(self, url_list):
2885         """Returns 1st active url from list"""
2886         for url in url_list:
2887             try:
2888                 compat_urllib_request.urlopen(url)
2889                 return url
2890             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2891                 url = None
2892
2893         return None
2894
2895     def _print_formats(self, formats):
2896         print('Available formats:')
2897         for fmt in formats.keys():
2898             for b in formats[fmt]:
2899                 try:
2900                     ext = formats[fmt][b][0]
2901                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2902                 except TypeError: # we have no bitrate info
2903                     ext = formats[fmt][0]
2904                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2905                     break
2906
2907     def _real_extract(self, url):
2908         mobj = re.match(self._VALID_URL, url)
2909         if mobj is None:
2910             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2911             return
2912         # extract uploader & filename from url
2913         uploader = mobj.group(1).decode('utf-8')
2914         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2915
2916         # construct API request
2917         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2918         # retrieve .json file with links to files
2919         request = compat_urllib_request.Request(file_url)
2920         try:
2921             self.report_download_json(file_url)
2922             jsonData = compat_urllib_request.urlopen(request).read()
2923         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2924             self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2925             return
2926
2927         # parse JSON
2928         json_data = json.loads(jsonData)
2929         player_url = json_data['player_swf_url']
2930         formats = dict(json_data['audio_formats'])
2931
2932         req_format = self._downloader.params.get('format', None)
2933         bitrate = None
2934
2935         if self._downloader.params.get('listformats', None):
2936             self._print_formats(formats)
2937             return
2938
2939         if req_format is None or req_format == 'best':
2940             for format_param in formats.keys():
2941                 url_list = self.get_urls(formats, format_param)
2942                 # check urls
2943                 file_url = self.check_urls(url_list)
2944                 if file_url is not None:
2945                     break # got it!
2946         else:
2947             if req_format not in formats:
2948                 self._downloader.trouble(u'ERROR: format is not available')
2949                 return
2950
2951             url_list = self.get_urls(formats, req_format)
2952             file_url = self.check_urls(url_list)
2953             format_param = req_format
2954
2955         return [{
2956             'id': file_id.decode('utf-8'),
2957             'url': file_url.decode('utf-8'),
2958             'uploader': uploader.decode('utf-8'),
2959             'upload_date': None,
2960             'title': json_data['name'],
2961             'ext': file_url.split('.')[-1].decode('utf-8'),
2962             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2963             'thumbnail': json_data['thumbnail_url'],
2964             'description': json_data['description'],
2965             'player_url': player_url.decode('utf-8'),
2966         }]
2967
2968 class StanfordOpenClassroomIE(InfoExtractor):
2969     """Information extractor for Stanford's Open ClassRoom"""
2970
2971     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2972     IE_NAME = u'stanfordoc'
2973
2974     def report_download_webpage(self, objid):
2975         """Report information extraction."""
2976         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2977
2978     def report_extraction(self, video_id):
2979         """Report information extraction."""
2980         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2981
2982     def _real_extract(self, url):
2983         mobj = re.match(self._VALID_URL, url)
2984         if mobj is None:
2985             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2986             return
2987
2988         if mobj.group('course') and mobj.group('video'): # A specific video
2989             course = mobj.group('course')
2990             video = mobj.group('video')
2991             info = {
2992                 'id': course + '_' + video,
2993                 'uploader': None,
2994                 'upload_date': None,
2995             }
2996
2997             self.report_extraction(info['id'])
2998             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2999             xmlUrl = baseUrl + video + '.xml'
3000             try:
3001                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3002             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3003                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3004                 return
3005             mdoc = xml.etree.ElementTree.fromstring(metaXml)
3006             try:
3007                 info['title'] = mdoc.findall('./title')[0].text
3008                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3009             except IndexError:
3010                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3011                 return
3012             info['ext'] = info['url'].rpartition('.')[2]
3013             return [info]
3014         elif mobj.group('course'): # A course page
3015             course = mobj.group('course')
3016             info = {
3017                 'id': course,
3018                 'type': 'playlist',
3019                 'uploader': None,
3020                 'upload_date': None,
3021             }
3022
3023             self.report_download_webpage(info['id'])
3024             try:
3025                 coursepage = compat_urllib_request.urlopen(url).read()
3026             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3027                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3028                 return
3029
3030             m = re.search('<h1>([^<]+)</h1>', coursepage)
3031             if m:
3032                 info['title'] = unescapeHTML(m.group(1))
3033             else:
3034                 info['title'] = info['id']
3035
3036             m = re.search('<description>([^<]+)</description>', coursepage)
3037             if m:
3038                 info['description'] = unescapeHTML(m.group(1))
3039
3040             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3041             info['list'] = [
3042                 {
3043                     'type': 'reference',
3044                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3045                 }
3046                     for vpage in links]
3047             results = []
3048             for entry in info['list']:
3049                 assert entry['type'] == 'reference'
3050                 results += self.extract(entry['url'])
3051             return results
3052
3053         else: # Root page
3054             info = {
3055                 'id': 'Stanford OpenClassroom',
3056                 'type': 'playlist',
3057                 'uploader': None,
3058                 'upload_date': None,
3059             }
3060
3061             self.report_download_webpage(info['id'])
3062             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3063             try:
3064                 rootpage = compat_urllib_request.urlopen(rootURL).read()
3065             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3066                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3067                 return
3068
3069             info['title'] = info['id']
3070
3071             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3072             info['list'] = [
3073                 {
3074                     'type': 'reference',
3075                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3076                 }
3077                     for cpage in links]
3078
3079             results = []
3080             for entry in info['list']:
3081                 assert entry['type'] == 'reference'
3082                 results += self.extract(entry['url'])
3083             return results
3084
3085 class MTVIE(InfoExtractor):
3086     """Information extractor for MTV.com"""
3087
3088     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3089     IE_NAME = u'mtv'
3090
3091     def report_extraction(self, video_id):
3092         """Report information extraction."""
3093         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3094
3095     def _real_extract(self, url):
3096         mobj = re.match(self._VALID_URL, url)
3097         if mobj is None:
3098             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3099             return
3100         if not mobj.group('proto'):
3101             url = 'http://' + url
3102         video_id = mobj.group('videoid')
3103
3104         webpage = self._download_webpage(url, video_id)
3105
3106         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3107         if mobj is None:
3108             self._downloader.trouble(u'ERROR: unable to extract song name')
3109             return
3110         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3111         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3112         if mobj is None:
3113             self._downloader.trouble(u'ERROR: unable to extract performer')
3114             return
3115         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3116         video_title = performer + ' - ' + song_name
3117
3118         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3119         if mobj is None:
3120             self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3121             return
3122         mtvn_uri = mobj.group(1)
3123
3124         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3125         if mobj is None:
3126             self._downloader.trouble(u'ERROR: unable to extract content id')
3127             return
3128         content_id = mobj.group(1)
3129
3130         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3131         self.report_extraction(video_id)
3132         request = compat_urllib_request.Request(videogen_url)
3133         try:
3134             metadataXml = compat_urllib_request.urlopen(request).read()
3135         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3136             self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3137             return
3138
3139         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3140         renditions = mdoc.findall('.//rendition')
3141
3142         # For now, always pick the highest quality.
3143         rendition = renditions[-1]
3144
3145         try:
3146             _,_,ext = rendition.attrib['type'].partition('/')
3147             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3148             video_url = rendition.find('./src').text
3149         except KeyError:
3150             self._downloader.trouble('Invalid rendition field.')
3151             return
3152
3153         info = {
3154             'id': video_id,
3155             'url': video_url,
3156             'uploader': performer,
3157             'upload_date': None,
3158             'title': video_title,
3159             'ext': ext,
3160             'format': format,
3161         }
3162
3163         return [info]
3164
3165
3166 class YoukuIE(InfoExtractor):
3167     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3168
3169     def report_download_webpage(self, file_id):
3170         """Report webpage download."""
3171         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3172
3173     def report_extraction(self, file_id):
3174         """Report information extraction."""
3175         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3176
3177     def _gen_sid(self):
3178         nowTime = int(time.time() * 1000)
3179         random1 = random.randint(1000,1998)
3180         random2 = random.randint(1000,9999)
3181
3182         return "%d%d%d" %(nowTime,random1,random2)
3183
3184     def _get_file_ID_mix_string(self, seed):
3185         mixed = []
3186         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3187         seed = float(seed)
3188         for i in range(len(source)):
3189             seed  =  (seed * 211 + 30031 ) % 65536
3190             index  =  math.floor(seed / 65536 * len(source) )
3191             mixed.append(source[int(index)])
3192             source.remove(source[int(index)])
3193         #return ''.join(mixed)
3194         return mixed
3195
3196     def _get_file_id(self, fileId, seed):
3197         mixed = self._get_file_ID_mix_string(seed)
3198         ids = fileId.split('*')
3199         realId = []
3200         for ch in ids:
3201             if ch:
3202                 realId.append(mixed[int(ch)])
3203         return ''.join(realId)
3204
3205     def _real_extract(self, url):
3206         mobj = re.match(self._VALID_URL, url)
3207         if mobj is None:
3208             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3209             return
3210         video_id = mobj.group('ID')
3211
3212         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3213
3214         request = compat_urllib_request.Request(info_url, None, std_headers)
3215         try:
3216             self.report_download_webpage(video_id)
3217             jsondata = compat_urllib_request.urlopen(request).read()
3218         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3219             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3220             return
3221
3222         self.report_extraction(video_id)
3223         try:
3224             jsonstr = jsondata.decode('utf-8')
3225             config = json.loads(jsonstr)
3226
3227             video_title =  config['data'][0]['title']
3228             seed = config['data'][0]['seed']
3229
3230             format = self._downloader.params.get('format', None)
3231             supported_format = list(config['data'][0]['streamfileids'].keys())
3232
3233             if format is None or format == 'best':
3234                 if 'hd2' in supported_format:
3235                     format = 'hd2'
3236                 else:
3237                     format = 'flv'
3238                 ext = u'flv'
3239             elif format == 'worst':
3240                 format = 'mp4'
3241                 ext = u'mp4'
3242             else:
3243                 format = 'flv'
3244                 ext = u'flv'
3245
3246
3247             fileid = config['data'][0]['streamfileids'][format]
3248             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3249         except (UnicodeDecodeError, ValueError, KeyError):
3250             self._downloader.trouble(u'ERROR: unable to extract info section')
3251             return
3252
3253         files_info=[]
3254         sid = self._gen_sid()
3255         fileid = self._get_file_id(fileid, seed)
3256
3257         #column 8,9 of fileid represent the segment number
3258         #fileid[7:9] should be changed
3259         for index, key in enumerate(keys):
3260
3261             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3262             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3263
3264             info = {
3265                 'id': '%s_part%02d' % (video_id, index),
3266                 'url': download_url,
3267                 'uploader': None,
3268                 'upload_date': None,
3269                 'title': video_title,
3270                 'ext': ext,
3271             }
3272             files_info.append(info)
3273
3274         return files_info
3275
3276
3277 class XNXXIE(InfoExtractor):
3278     """Information extractor for xnxx.com"""
3279
3280     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3281     IE_NAME = u'xnxx'
3282     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3283     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3284     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3285
3286     def report_webpage(self, video_id):
3287         """Report information extraction"""
3288         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3289
3290     def report_extraction(self, video_id):
3291         """Report information extraction"""
3292         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3293
3294     def _real_extract(self, url):
3295         mobj = re.match(self._VALID_URL, url)
3296         if mobj is None:
3297             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3298             return
3299         video_id = mobj.group(1)
3300
3301         self.report_webpage(video_id)
3302
3303         # Get webpage content
3304         try:
3305             webpage_bytes = compat_urllib_request.urlopen(url).read()
3306             webpage = webpage_bytes.decode('utf-8')
3307         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3308             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3309             return
3310
3311         result = re.search(self.VIDEO_URL_RE, webpage)
3312         if result is None:
3313             self._downloader.trouble(u'ERROR: unable to extract video url')
3314             return
3315         video_url = compat_urllib_parse.unquote(result.group(1))
3316
3317         result = re.search(self.VIDEO_TITLE_RE, webpage)
3318         if result is None:
3319             self._downloader.trouble(u'ERROR: unable to extract video title')
3320             return
3321         video_title = result.group(1)
3322
3323         result = re.search(self.VIDEO_THUMB_RE, webpage)
3324         if result is None:
3325             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3326             return
3327         video_thumbnail = result.group(1)
3328
3329         return [{
3330             'id': video_id,
3331             'url': video_url,
3332             'uploader': None,
3333             'upload_date': None,
3334             'title': video_title,
3335             'ext': 'flv',
3336             'thumbnail': video_thumbnail,
3337             'description': None,
3338         }]
3339
3340
3341 class GooglePlusIE(InfoExtractor):
3342     """Information extractor for plus.google.com."""
3343
3344     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3345     IE_NAME = u'plus.google'
3346
3347     def __init__(self, downloader=None):
3348         InfoExtractor.__init__(self, downloader)
3349
3350     def report_extract_entry(self, url):
3351         """Report downloading extry"""
3352         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3353
3354     def report_date(self, upload_date):
3355         """Report downloading extry"""
3356         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3357
3358     def report_uploader(self, uploader):
3359         """Report downloading extry"""
3360         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3361
3362     def report_title(self, video_title):
3363         """Report downloading extry"""
3364         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3365
3366     def report_extract_vid_page(self, video_page):
3367         """Report information extraction."""
3368         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3369
3370     def _real_extract(self, url):
3371         # Extract id from URL
3372         mobj = re.match(self._VALID_URL, url)
3373         if mobj is None:
3374             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3375             return
3376
3377         post_url = mobj.group(0)
3378         video_id = mobj.group(1)
3379
3380         video_extension = 'flv'
3381
3382         # Step 1, Retrieve post webpage to extract further information
3383         self.report_extract_entry(post_url)
3384         request = compat_urllib_request.Request(post_url)
3385         try:
3386             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3387         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3388             self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3389             return
3390
3391         # Extract update date
3392         upload_date = None
3393         pattern = 'title="Timestamp">(.*?)</a>'
3394         mobj = re.search(pattern, webpage)
3395         if mobj:
3396             upload_date = mobj.group(1)
3397             # Convert timestring to a format suitable for filename
3398             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3399             upload_date = upload_date.strftime('%Y%m%d')
3400         self.report_date(upload_date)
3401
3402         # Extract uploader
3403         uploader = None
3404         pattern = r'rel\="author".*?>(.*?)</a>'
3405         mobj = re.search(pattern, webpage)
3406         if mobj:
3407             uploader = mobj.group(1)
3408         self.report_uploader(uploader)
3409
3410         # Extract title
3411         # Get the first line for title
3412         video_title = u'NA'
3413         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3414         mobj = re.search(pattern, webpage)
3415         if mobj:
3416             video_title = mobj.group(1)
3417         self.report_title(video_title)
3418
3419         # Step 2, Stimulate clicking the image box to launch video
3420         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3421         mobj = re.search(pattern, webpage)
3422         if mobj is None:
3423             self._downloader.trouble(u'ERROR: unable to extract video page URL')
3424
3425         video_page = mobj.group(1)
3426         request = compat_urllib_request.Request(video_page)
3427         try:
3428             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3429         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3430             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3431             return
3432         self.report_extract_vid_page(video_page)
3433
3434
3435         # Extract video links on video page
3436         """Extract video links of all sizes"""
3437         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3438         mobj = re.findall(pattern, webpage)
3439         if len(mobj) == 0:
3440             self._downloader.trouble(u'ERROR: unable to extract video links')
3441
3442         # Sort in resolution
3443         links = sorted(mobj)
3444
3445         # Choose the lowest of the sort, i.e. highest resolution
3446         video_url = links[-1]
3447         # Only get the url. The resolution part in the tuple has no use anymore
3448         video_url = video_url[-1]
3449         # Treat escaped \u0026 style hex
3450         try:
3451             video_url = video_url.decode("unicode_escape")
3452         except AttributeError: # Python 3
3453             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3454
3455
3456         return [{
3457             'id':       video_id,
3458             'url':      video_url,
3459             'uploader': uploader,
3460             'upload_date':  upload_date,
3461             'title':    video_title,
3462             'ext':      video_extension,
3463         }]
3464
3465 class NBAIE(InfoExtractor):
3466     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3467     IE_NAME = u'nba'
3468
3469     def _real_extract(self, url):
3470         mobj = re.match(self._VALID_URL, url)
3471         if mobj is None:
3472             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3473             return
3474
3475         video_id = mobj.group(1)
3476         if video_id.endswith('/index.html'):
3477             video_id = video_id[:-len('/index.html')]
3478
3479         webpage = self._download_webpage(url, video_id)
3480
3481         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3482         def _findProp(rexp, default=None):
3483             m = re.search(rexp, webpage)
3484             if m:
3485                 return unescapeHTML(m.group(1))
3486             else:
3487                 return default
3488
3489         shortened_video_id = video_id.rpartition('/')[2]
3490         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3491         info = {
3492             'id': shortened_video_id,
3493             'url': video_url,
3494             'ext': 'mp4',
3495             'title': title,
3496             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3497             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3498         }
3499         return [info]
3500
3501 class JustinTVIE(InfoExtractor):
3502     """Information extractor for justin.tv and twitch.tv"""
3503     # TODO: One broadcast may be split into multiple videos. The key
3504     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3505     # starts at 1 and increases. Can we treat all parts as one video?
3506
3507     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3508         ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3509     _JUSTIN_PAGE_LIMIT = 100
3510     IE_NAME = u'justin.tv'
3511
3512     def report_extraction(self, file_id):
3513         """Report information extraction."""
3514         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3515
3516     def report_download_page(self, channel, offset):
3517         """Report attempt to download a single page of videos."""
3518         self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3519                 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3520
3521     # Return count of items, list of *valid* items
3522     def _parse_page(self, url):
3523         try:
3524             urlh = compat_urllib_request.urlopen(url)
3525             webpage_bytes = urlh.read()
3526             webpage = webpage_bytes.decode('utf-8', 'ignore')
3527         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3528             self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3529             return
3530
3531         response = json.loads(webpage)
3532         if type(response) != list:
3533             error_text = response.get('error', 'unknown error')
3534             self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3535             return
3536         info = []
3537         for clip in response:
3538             video_url = clip['video_file_url']
3539             if video_url:
3540                 video_extension = os.path.splitext(video_url)[1][1:]
3541                 video_date = re.sub('-', '', clip['start_time'][:10])
3542                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3543                 info.append({
3544                     'id': clip['id'],
3545                     'url': video_url,
3546                     'title': clip['title'],
3547                     'uploader': clip.get('channel_name', video_uploader_id),
3548                     'uploader_id': video_uploader_id,
3549                     'upload_date': video_date,
3550                     'ext': video_extension,
3551                 })
3552         return (len(response), info)
3553
3554     def _real_extract(self, url):
3555         mobj = re.match(self._VALID_URL, url)
3556         if mobj is None:
3557             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3558             return
3559
3560         api = 'http://api.justin.tv'
3561         video_id = mobj.group(mobj.lastindex)
3562         paged = False
3563         if mobj.lastindex == 1:
3564             paged = True
3565             api += '/channel/archives/%s.json'
3566         else:
3567             api += '/broadcast/by_archive/%s.json'
3568         api = api % (video_id,)
3569
3570         self.report_extraction(video_id)
3571
3572         info = []
3573         offset = 0
3574         limit = self._JUSTIN_PAGE_LIMIT
3575         while True:
3576             if paged:
3577                 self.report_download_page(video_id, offset)
3578             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3579             page_count, page_info = self._parse_page(page_url)
3580             info.extend(page_info)
3581             if not paged or page_count != limit:
3582                 break
3583             offset += limit
3584         return info
3585
3586 class FunnyOrDieIE(InfoExtractor):
3587     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3588
3589     def _real_extract(self, url):
3590         mobj = re.match(self._VALID_URL, url)
3591         if mobj is None:
3592             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3593             return
3594
3595         video_id = mobj.group('id')
3596         webpage = self._download_webpage(url, video_id)
3597
3598         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3599         if not m:
3600             self._downloader.trouble(u'ERROR: unable to find video information')
3601         video_url = unescapeHTML(m.group('url'))
3602
3603         m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3604         if not m:
3605             self._downloader.trouble(u'Cannot find video title')
3606         title = unescapeHTML(m.group('title'))
3607
3608         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3609         if m:
3610             desc = unescapeHTML(m.group('desc'))
3611         else:
3612             desc = None
3613
3614         info = {
3615             'id': video_id,
3616             'url': video_url,
3617             'ext': 'mp4',
3618             'title': title,
3619             'description': desc,
3620         }
3621         return [info]
3622
3623 class TweetReelIE(InfoExtractor):
3624     _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3625
3626     def _real_extract(self, url):
3627         mobj = re.match(self._VALID_URL, url)
3628         if mobj is None:
3629             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3630             return
3631
3632         video_id = mobj.group('id')
3633         webpage = self._download_webpage(url, video_id)
3634
3635         m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3636         if not m:
3637             self._downloader.trouble(u'ERROR: Cannot find status ID')
3638         status_id = m.group(1)
3639
3640         m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3641         if not m:
3642             self._downloader.trouble(u'WARNING: Cannot find description')
3643         desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3644
3645         m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3646         if not m:
3647             self._downloader.trouble(u'ERROR: Cannot find uploader')
3648         uploader = unescapeHTML(m.group('uploader'))
3649         uploader_id = unescapeHTML(m.group('uploader_id'))
3650
3651         m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3652         if not m:
3653             self._downloader.trouble(u'ERROR: Cannot find upload date')
3654         upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3655
3656         title = desc
3657         video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3658
3659         info = {
3660             'id': video_id,
3661             'url': video_url,
3662             'ext': 'mov',
3663             'title': title,
3664             'description': desc,
3665             'uploader': uploader,
3666             'uploader_id': uploader_id,
3667             'internal_id': status_id,
3668             'upload_date': upload_date
3669         }
3670         return [info]
3671
3672 class SteamIE(InfoExtractor):
3673     _VALID_URL = r"""http://store.steampowered.com/
3674                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3675                 (?P<gameID>\d+)/?
3676                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3677                 """
3678
3679     def suitable(self, url):
3680         """Receives a URL and returns True if suitable for this IE."""
3681         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3682
3683     def _real_extract(self, url):
3684         m = re.match(self._VALID_URL, url, re.VERBOSE)
3685         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3686         gameID = m.group('gameID')
3687         videourl = 'http://store.steampowered.com/video/%s/' % gameID
3688         webpage = self._download_webpage(videourl, gameID)
3689         mweb = re.finditer(urlRE, webpage)
3690         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3691         titles = re.finditer(namesRE, webpage)
3692         videos = []
3693         for vid,vtitle in zip(mweb,titles):
3694             video_id = vid.group('videoID')
3695             title = vtitle.group('videoName')
3696             video_url = vid.group('videoURL')
3697             if not video_url:
3698                 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3699             info = {
3700                 'id':video_id,
3701                 'url':video_url,
3702                 'ext': 'flv',
3703                 'title': unescapeHTML(title)
3704                   }
3705             videos.append(info)
3706         return videos
3707
3708 class UstreamIE(InfoExtractor):
3709     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3710     IE_NAME = u'ustream'
3711
3712     def _real_extract(self, url):
3713         m = re.match(self._VALID_URL, url)
3714         video_id = m.group('videoID')
3715         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3716         webpage = self._download_webpage(url, video_id)
3717         m = re.search(r'data-title="(?P<title>.+)"',webpage)
3718         title = m.group('title')
3719         m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3720         uploader = m.group('uploader')
3721         info = {
3722                 'id':video_id,
3723                 'url':video_url,
3724                 'ext': 'flv',
3725                 'title': title,
3726                 'uploader': uploader
3727                   }
3728         return [info]
3729
3730
3731
3732 class YouPornIE(InfoExtractor):
3733     """Information extractor for youporn.com."""
3734     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3735
3736     def _print_formats(self, formats):
3737         """Print all available formats"""
3738         print(u'Available formats:')
3739         print(u'ext\t\tformat')
3740         print(u'---------------------------------')
3741         for format in formats:
3742             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3743
3744     def _specific(self, req_format, formats):
3745         for x in formats:
3746             if(x["format"]==req_format):
3747                 return x
3748         return None
3749
3750     def _real_extract(self, url):
3751         mobj = re.match(self._VALID_URL, url)
3752         if mobj is None:
3753             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3754             return
3755
3756         video_id = mobj.group('videoid')
3757
3758         req = compat_urllib_request.Request(url)
3759         req.add_header('Cookie', 'age_verified=1')
3760         webpage = self._download_webpage(req, video_id)
3761
3762         # Get the video title
3763         result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
3764         if result is None:
3765             raise ExtractorError(u'ERROR: unable to extract video title')
3766         video_title = result.group('title').strip()
3767
3768         # Get the video date
3769         result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
3770         if result is None:
3771             self._downloader.to_stderr(u'WARNING: unable to extract video date')
3772             upload_date = None
3773         else:
3774             upload_date = result.group('date').strip()
3775
3776         # Get the video uploader
3777         result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
3778         if result is None:
3779             self._downloader.to_stderr(u'ERROR: unable to extract uploader')
3780             video_uploader = None
3781         else:
3782             video_uploader = result.group('uploader').strip()
3783             video_uploader = clean_html( video_uploader )
3784
3785         # Get all of the formats available
3786         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3787         result = re.search(DOWNLOAD_LIST_RE, webpage)
3788         if result is None:
3789             raise ExtractorError(u'Unable to extract download list')
3790         download_list_html = result.group('download_list').strip()
3791
3792         # Get all of the links from the page
3793         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3794         links = re.findall(LINK_RE, download_list_html)
3795         if(len(links) == 0):
3796             raise ExtractorError(u'ERROR: no known formats available for video')
3797
3798         self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3799
3800         formats = []
3801         for link in links:
3802
3803             # A link looks like this:
3804             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3805             # A path looks like this:
3806             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3807             video_url = unescapeHTML( link )
3808             path = compat_urllib_parse_urlparse( video_url ).path
3809             extension = os.path.splitext( path )[1][1:]
3810             format = path.split('/')[4].split('_')[:2]
3811             size = format[0]
3812             bitrate = format[1]
3813             format = "-".join( format )
3814             title = u'%s-%s-%s' % (video_title, size, bitrate)
3815
3816             formats.append({
3817                 'id': video_id,
3818                 'url': video_url,
3819                 'uploader': video_uploader,
3820                 'upload_date': upload_date,
3821                 'title': title,
3822                 'ext': extension,
3823                 'format': format,
3824                 'thumbnail': None,
3825                 'description': None,
3826                 'player_url': None
3827             })
3828
3829         if self._downloader.params.get('listformats', None):
3830             self._print_formats(formats)
3831             return
3832
3833         req_format = self._downloader.params.get('format', None)
3834         self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3835
3836         if req_format is None or req_format == 'best':
3837             return [formats[0]]
3838         elif req_format == 'worst':
3839             return [formats[-1]]
3840         elif req_format in ('-1', 'all'):
3841             return formats
3842         else:
3843             format = self._specific( req_format, formats )
3844             if result is None:
3845                 self._downloader.trouble(u'ERROR: requested format not available')
3846                 return
3847             return [format]
3848
3849
3850
3851 class PornotubeIE(InfoExtractor):
3852     """Information extractor for pornotube.com."""
3853     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3854
3855     def _real_extract(self, url):
3856         mobj = re.match(self._VALID_URL, url)
3857         if mobj is None:
3858             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3859             return
3860
3861         video_id = mobj.group('videoid')
3862         video_title = mobj.group('title')
3863
3864         # Get webpage content
3865         webpage = self._download_webpage(url, video_id)
3866
3867         # Get the video URL
3868         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3869         result = re.search(VIDEO_URL_RE, webpage)
3870         if result is None:
3871             self._downloader.trouble(u'ERROR: unable to extract video url')
3872             return
3873         video_url = compat_urllib_parse.unquote(result.group('url'))
3874
3875         #Get the uploaded date
3876         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3877         result = re.search(VIDEO_UPLOADED_RE, webpage)
3878         if result is None:
3879             self._downloader.trouble(u'ERROR: unable to extract video title')
3880             return
3881         upload_date = result.group('date')
3882
3883         info = {'id': video_id,
3884                 'url': video_url,
3885                 'uploader': None,
3886                 'upload_date': upload_date,
3887                 'title': video_title,
3888                 'ext': 'flv',
3889                 'format': 'flv'}
3890
3891         return [info]
3892
3893
3894
3895 class YouJizzIE(InfoExtractor):
3896     """Information extractor for youjizz.com."""
3897     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3898
3899     def __init__(self, downloader=None):
3900         InfoExtractor.__init__(self, downloader)
3901
3902     def _real_extract(self, url):
3903         mobj = re.match(self._VALID_URL, url)
3904         if mobj is None:
3905             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3906             return
3907
3908         video_id = mobj.group('videoid')
3909
3910         # Get webpage content
3911         webpage = self._download_webpage(url, video_id)
3912
3913         # Get the video title
3914         VIDEO_TITLE_RE = r'<title>(?P<title>.*)</title>'
3915         result = re.search(VIDEO_TITLE_RE, webpage)
3916         if result is None:
3917             self._downloader.trouble(u'ERROR: unable to extract video title')
3918             return
3919         video_title = result.group('title').strip()
3920
3921         # Get the embed page
3922         EMBED_PAGE_RE = r'http://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)'
3923         result = re.search(EMBED_PAGE_RE, webpage)
3924         if result is None:
3925             self._downloader.trouble(u'ERROR: unable to extract embed page')
3926             return
3927
3928         embed_page_url = result.group(0).strip()
3929         video_id = result.group('videoid')
3930
3931         webpage = self._download_webpage(embed_page_url, video_id)
3932
3933         # Get the video URL
3934         SOURCE_RE = r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);'
3935         result = re.search(SOURCE_RE, webpage)
3936         if result is None:
3937             self._downloader.trouble(u'ERROR: unable to extract video url')
3938             return
3939         video_url = result.group('source')
3940
3941         info = {'id': video_id,
3942                 'url': video_url,
3943                 'uploader': None,
3944                 'upload_date': None,
3945                 'title': video_title,
3946                 'ext': 'flv',
3947                 'format': 'flv',
3948                 'thumbnail': None,
3949                 'description': None,
3950                 'player_url': embed_page_url}
3951
3952         return [info]
3953
3954
3955 def gen_extractors():
3956     """ Return a list of an instance of every supported extractor.
3957     The order does matter; the first extractor matched is the one handling the URL.
3958     """
3959     return [
3960         YoutubePlaylistIE(),
3961         YoutubeChannelIE(),
3962         YoutubeUserIE(),
3963         YoutubeSearchIE(),
3964         YoutubeIE(),
3965         MetacafeIE(),
3966         DailymotionIE(),
3967         GoogleSearchIE(),
3968         PhotobucketIE(),
3969         YahooIE(),
3970         YahooSearchIE(),
3971         DepositFilesIE(),
3972         FacebookIE(),
3973         BlipTVUserIE(),
3974         BlipTVIE(),
3975         VimeoIE(),
3976         MyVideoIE(),
3977         ComedyCentralIE(),
3978         EscapistIE(),
3979         CollegeHumorIE(),
3980         XVideosIE(),
3981         SoundcloudIE(),
3982         InfoQIE(),
3983         MixcloudIE(),
3984         StanfordOpenClassroomIE(),
3985         MTVIE(),
3986         YoukuIE(),
3987         XNXXIE(),
3988         YouJizzIE(),
3989         PornotubeIE(),
3990         YouPornIE(),
3991         GooglePlusIE(),
3992         ArteTvIE(),
3993         NBAIE(),
3994         JustinTVIE(),
3995         FunnyOrDieIE(),
3996         TweetReelIE(),
3997         SteamIE(),
3998         UstreamIE(),
3999         GenericIE()
4000     ]
4001
4002