youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import netrc
   9 import os
  10 import re
  11 import socket
  12 import time
  13 import email.utils
  14 import xml.etree.ElementTree
  15 import random
  16 import math
  17
  18 from .utils import *
  19
  20
  21 class InfoExtractor(object):
  22     """Information Extractor class.
  23
  24     Information extractors are the classes that, given a URL, extract
  25     information about the video (or videos) the URL refers to. This
  26     information includes the real video URL, the video title, author and
  27     others. The information is stored in a dictionary which is then
  28     passed to the FileDownloader. The FileDownloader processes this
  29     information possibly downloading the video to the file system, among
  30     other possible outcomes.
  31
  32     The dictionaries must include the following fields:
  33
  34     id:             Video identifier.
  35     url:            Final video URL.
  36     title:          Video title, unescaped.
  37     ext:            Video filename extension.
  38     uploader:       Full name of the video uploader.
  39     upload_date:    Video upload date (YYYYMMDD).
  40
  41     The following fields are optional:
  42
  43     format:         The video format, defaults to ext (used for --get-format)
  44     thumbnail:      Full URL to a video thumbnail image.
  45     description:    One-line video description.
  46     uploader_id:    Nickname or id of the video uploader.
  47     player_url:     SWF Player URL (used for rtmpdump).
  48     subtitles:      The .srt file contents.
  49     urlhandle:      [internal] The urlHandle to be used to download the file,
  50                     like returned by urllib.request.urlopen
  51
  52     The fields should all be Unicode strings.
  53
  54     Subclasses of this one should re-define the _real_initialize() and
  55     _real_extract() methods and define a _VALID_URL regexp.
  56     Probably, they should also be added to the list of extractors.
  57
  58     _real_extract() must return a *list* of information dictionaries as
  59     described above.
  60
  61     Finally, the _WORKING attribute should be set to False for broken IEs
  62     in order to warn the users and skip the tests.
  63     """
  64
  65     _ready = False
  66     _downloader = None
  67     _WORKING = True
  68
  69     def __init__(self, downloader=None):
  70         """Constructor. Receives an optional downloader."""
  71         self._ready = False
  72         self.set_downloader(downloader)
  73
  74     def suitable(self, url):
  75         """Receives a URL and returns True if suitable for this IE."""
  76         return re.match(self._VALID_URL, url) is not None
  77
  78     def working(self):
  79         """Getter method for _WORKING."""
  80         return self._WORKING
  81
  82     def initialize(self):
  83         """Initializes an instance (authentication, etc)."""
  84         if not self._ready:
  85             self._real_initialize()
  86             self._ready = True
  87
  88     def extract(self, url):
  89         """Extracts URL information and returns it in list of dicts."""
  90         self.initialize()
  91         return self._real_extract(url)
  92
  93     def set_downloader(self, downloader):
  94         """Sets the downloader for this IE."""
  95         self._downloader = downloader
  96
  97     def _real_initialize(self):
  98         """Real initialization process. Redefine in subclasses."""
  99         pass
 100
 101     def _real_extract(self, url):
 102         """Real extraction process. Redefine in subclasses."""
 103         pass
 104
 105     @property
 106     def IE_NAME(self):
 107         return type(self).__name__[:-2]
 108
 109     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 110         if note is None:
 111             note = u'Downloading video webpage'
 112         self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
 113         try:
 114             urlh = compat_urllib_request.urlopen(url_or_request)
 115             webpage_bytes = urlh.read()
 116             return webpage_bytes.decode('utf-8', 'replace')
 117         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 118             if errnote is None:
 119                 errnote = u'Unable to download webpage'
 120             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 121
 122
 123 class YoutubeIE(InfoExtractor):
 124     """Information extractor for youtube.com."""
 125
 126     _VALID_URL = r"""^
 127                      (
 128                          (?:https?://)?                                       # http(s):// (optional)
 129                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 130                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 131                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 132                          (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 133                          (?:                                                  # the various things that can precede the ID:
 134                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 135                              |(?:                                             # or the v= param in all its forms
 136                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 137                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 138                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 139                                  v=
 140                              )
 141                          )?                                                   # optional -> youtube.com/xxxx is OK
 142                      )?                                                       # all until now is optional -> you can pass the naked ID
 143                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 144                      (?(1).+)?                                                # if we found the ID, everything can follow
 145                      $"""
 146     _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 147     _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 148     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 149     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 150     _NETRC_MACHINE = 'youtube'
 151     # Listed in order of quality
 152     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 153     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 154     _video_extensions = {
 155         '13': '3gp',
 156         '17': 'mp4',
 157         '18': 'mp4',
 158         '22': 'mp4',
 159         '37': 'mp4',
 160         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 161         '43': 'webm',
 162         '44': 'webm',
 163         '45': 'webm',
 164         '46': 'webm',
 165     }
 166     _video_dimensions = {
 167         '5': '240x400',
 168         '6': '???',
 169         '13': '???',
 170         '17': '144x176',
 171         '18': '360x640',
 172         '22': '720x1280',
 173         '34': '360x640',
 174         '35': '480x854',
 175         '37': '1080x1920',
 176         '38': '3072x4096',
 177         '43': '360x640',
 178         '44': '480x854',
 179         '45': '720x1280',
 180         '46': '1080x1920',
 181     }
 182     IE_NAME = u'youtube'
 183
 184     def suitable(self, url):
 185         """Receives a URL and returns True if suitable for this IE."""
 186         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 187
 188     def report_lang(self):
 189         """Report attempt to set language."""
 190         self._downloader.to_screen(u'[youtube] Setting language')
 191
 192     def report_login(self):
 193         """Report attempt to log in."""
 194         self._downloader.to_screen(u'[youtube] Logging in')
 195
 196     def report_age_confirmation(self):
 197         """Report attempt to confirm age."""
 198         self._downloader.to_screen(u'[youtube] Confirming age')
 199
 200     def report_video_webpage_download(self, video_id):
 201         """Report attempt to download video webpage."""
 202         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 203
 204     def report_video_info_webpage_download(self, video_id):
 205         """Report attempt to download video info webpage."""
 206         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 207
 208     def report_video_subtitles_download(self, video_id):
 209         """Report attempt to download video info webpage."""
 210         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 211
 212     def report_information_extraction(self, video_id):
 213         """Report attempt to extract video information."""
 214         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 215
 216     def report_unavailable_format(self, video_id, format):
 217         """Report extracted video URL."""
 218         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 219
 220     def report_rtmp_download(self):
 221         """Indicate the download will use the RTMP protocol."""
 222         self._downloader.to_screen(u'[youtube] RTMP download detected')
 223
 224     def _closed_captions_xml_to_srt(self, xml_string):
 225         srt = ''
 226         texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 227         # TODO parse xml instead of regex
 228         for n, (start, dur_tag, dur, caption) in enumerate(texts):
 229             if not dur: dur = '4'
 230             start = float(start)
 231             end = start + float(dur)
 232             start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 233             end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 234             caption = unescapeHTML(caption)
 235             caption = unescapeHTML(caption) # double cycle, intentional
 236             srt += str(n+1) + '\n'
 237             srt += start + ' --> ' + end + '\n'
 238             srt += caption + '\n\n'
 239         return srt
 240
 241     def _extract_subtitles(self, video_id):
 242         self.report_video_subtitles_download(video_id)
 243         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 244         try:
 245             srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 246         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 247             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 248         srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 249         srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 250         if not srt_lang_list:
 251             return (u'WARNING: video has no closed captions', None)
 252         if self._downloader.params.get('subtitleslang', False):
 253             srt_lang = self._downloader.params.get('subtitleslang')
 254         elif 'en' in srt_lang_list:
 255             srt_lang = 'en'
 256         else:
 257             srt_lang = list(srt_lang_list.keys())[0]
 258         if not srt_lang in srt_lang_list:
 259             return (u'WARNING: no closed captions found in the specified language', None)
 260         request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
 261         try:
 262             srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
 263         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 264             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 265         if not srt_xml:
 266             return (u'WARNING: unable to download video subtitles', None)
 267         return (None, self._closed_captions_xml_to_srt(srt_xml))
 268
 269     def _print_formats(self, formats):
 270         print('Available formats:')
 271         for x in formats:
 272             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 273
 274     def _real_initialize(self):
 275         if self._downloader is None:
 276             return
 277
 278         username = None
 279         password = None
 280         downloader_params = self._downloader.params
 281
 282         # Attempt to use provided username and password or .netrc data
 283         if downloader_params.get('username', None) is not None:
 284             username = downloader_params['username']
 285             password = downloader_params['password']
 286         elif downloader_params.get('usenetrc', False):
 287             try:
 288                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 289                 if info is not None:
 290                     username = info[0]
 291                     password = info[2]
 292                 else:
 293                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 294             except (IOError, netrc.NetrcParseError) as err:
 295                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
 296                 return
 297
 298         # Set language
 299         request = compat_urllib_request.Request(self._LANG_URL)
 300         try:
 301             self.report_lang()
 302             compat_urllib_request.urlopen(request).read()
 303         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 304             self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
 305             return
 306
 307         # No authentication to be performed
 308         if username is None:
 309             return
 310
 311         # Log in
 312         login_form = {
 313                 'current_form': 'loginForm',
 314                 'next':     '/',
 315                 'action_login': 'Log In',
 316                 'username': username,
 317                 'password': password,
 318                 }
 319         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
 320         try:
 321             self.report_login()
 322             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 323             if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 324                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 325                 return
 326         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 327             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
 328             return
 329
 330         # Confirm age
 331         age_form = {
 332                 'next_url':     '/',
 333                 'action_confirm':   'Confirm',
 334                 }
 335         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 336         try:
 337             self.report_age_confirmation()
 338             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 339         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 340             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 341             return
 342
 343     def _extract_id(self, url):
 344         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 345         if mobj is None:
 346             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 347             return
 348         video_id = mobj.group(2)
 349         return video_id
 350
 351     def _real_extract(self, url):
 352         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 353         mobj = re.search(self._NEXT_URL_RE, url)
 354         if mobj:
 355             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 356         video_id = self._extract_id(url)
 357
 358         # Get video webpage
 359         self.report_video_webpage_download(video_id)
 360         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 361         request = compat_urllib_request.Request(url)
 362         try:
 363             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 364         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 365             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
 366             return
 367
 368         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 369
 370         # Attempt to extract SWF player URL
 371         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 372         if mobj is not None:
 373             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 374         else:
 375             player_url = None
 376
 377         # Get video info
 378         self.report_video_info_webpage_download(video_id)
 379         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 380             video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 381                     % (video_id, el_type))
 382             request = compat_urllib_request.Request(video_info_url)
 383             try:
 384                 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
 385                 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
 386                 video_info = compat_parse_qs(video_info_webpage)
 387                 if 'token' in video_info:
 388                     break
 389             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 390                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
 391                 return
 392         if 'token' not in video_info:
 393             if 'reason' in video_info:
 394                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
 395             else:
 396                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 397             return
 398
 399         # Check for "rental" videos
 400         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 401             self._downloader.trouble(u'ERROR: "rental" videos not supported')
 402             return
 403
 404         # Start extracting information
 405         self.report_information_extraction(video_id)
 406
 407         # uploader
 408         if 'author' not in video_info:
 409             self._downloader.trouble(u'ERROR: unable to extract uploader name')
 410             return
 411         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 412
 413         # uploader_id
 414         video_uploader_id = None
 415         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 416         if mobj is not None:
 417             video_uploader_id = mobj.group(1)
 418         else:
 419             self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 420
 421         # title
 422         if 'title' not in video_info:
 423             self._downloader.trouble(u'ERROR: unable to extract video title')
 424             return
 425         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 426
 427         # thumbnail image
 428         if 'thumbnail_url' not in video_info:
 429             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 430             video_thumbnail = ''
 431         else:   # don't panic if we can't find it
 432             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 433
 434         # upload date
 435         upload_date = None
 436         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 437         if mobj is not None:
 438             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 439             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 440             for expression in format_expressions:
 441                 try:
 442                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 443                 except:
 444                     pass
 445
 446         # description
 447         video_description = get_element_by_id("eow-description", video_webpage)
 448         if video_description:
 449             video_description = clean_html(video_description)
 450         else:
 451             video_description = ''
 452
 453         # closed captions
 454         video_subtitles = None
 455         if self._downloader.params.get('writesubtitles', False):
 456             (srt_error, video_subtitles) = self._extract_subtitles(video_id)
 457             if srt_error:
 458                 self._downloader.trouble(srt_error)
 459
 460         if 'length_seconds' not in video_info:
 461             self._downloader.trouble(u'WARNING: unable to extract video duration')
 462             video_duration = ''
 463         else:
 464             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 465
 466         # token
 467         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 468
 469         # Decide which formats to download
 470         req_format = self._downloader.params.get('format', None)
 471
 472         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 473             self.report_rtmp_download()
 474             video_url_list = [(None, video_info['conn'][0])]
 475         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 476             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 477             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 478             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 479             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 480
 481             format_limit = self._downloader.params.get('format_limit', None)
 482             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 483             if format_limit is not None and format_limit in available_formats:
 484                 format_list = available_formats[available_formats.index(format_limit):]
 485             else:
 486                 format_list = available_formats
 487             existing_formats = [x for x in format_list if x in url_map]
 488             if len(existing_formats) == 0:
 489                 self._downloader.trouble(u'ERROR: no known formats available for video')
 490                 return
 491             if self._downloader.params.get('listformats', None):
 492                 self._print_formats(existing_formats)
 493                 return
 494             if req_format is None or req_format == 'best':
 495                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 496             elif req_format == 'worst':
 497                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 498             elif req_format in ('-1', 'all'):
 499                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 500             else:
 501                 # Specific formats. We pick the first in a slash-delimeted sequence.
 502                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 503                 req_formats = req_format.split('/')
 504                 video_url_list = None
 505                 for rf in req_formats:
 506                     if rf in url_map:
 507                         video_url_list = [(rf, url_map[rf])]
 508                         break
 509                 if video_url_list is None:
 510                     self._downloader.trouble(u'ERROR: requested format not available')
 511                     return
 512         else:
 513             self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 514             return
 515
 516         results = []
 517         for format_param, video_real_url in video_url_list:
 518             # Extension
 519             video_extension = self._video_extensions.get(format_param, 'flv')
 520
 521             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 522                                               self._video_dimensions.get(format_param, '???'))
 523
 524             results.append({
 525                 'id':       video_id,
 526                 'url':      video_real_url,
 527                 'uploader': video_uploader,
 528                 'uploader_id': video_uploader_id,
 529                 'upload_date':  upload_date,
 530                 'title':    video_title,
 531                 'ext':      video_extension,
 532                 'format':   video_format,
 533                 'thumbnail':    video_thumbnail,
 534                 'description':  video_description,
 535                 'player_url':   player_url,
 536                 'subtitles':    video_subtitles,
 537                 'duration':     video_duration
 538             })
 539         return results
 540
 541
 542 class MetacafeIE(InfoExtractor):
 543     """Information Extractor for metacafe.com."""
 544
 545     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 546     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 547     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 548     IE_NAME = u'metacafe'
 549
 550     def __init__(self, downloader=None):
 551         InfoExtractor.__init__(self, downloader)
 552
 553     def report_disclaimer(self):
 554         """Report disclaimer retrieval."""
 555         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 556
 557     def report_age_confirmation(self):
 558         """Report attempt to confirm age."""
 559         self._downloader.to_screen(u'[metacafe] Confirming age')
 560
 561     def report_download_webpage(self, video_id):
 562         """Report webpage download."""
 563         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 564
 565     def report_extraction(self, video_id):
 566         """Report information extraction."""
 567         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 568
 569     def _real_initialize(self):
 570         # Retrieve disclaimer
 571         request = compat_urllib_request.Request(self._DISCLAIMER)
 572         try:
 573             self.report_disclaimer()
 574             disclaimer = compat_urllib_request.urlopen(request).read()
 575         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 576             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
 577             return
 578
 579         # Confirm age
 580         disclaimer_form = {
 581             'filters': '0',
 582             'submit': "Continue - I'm over 18",
 583             }
 584         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 585         try:
 586             self.report_age_confirmation()
 587             disclaimer = compat_urllib_request.urlopen(request).read()
 588         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 589             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 590             return
 591
 592     def _real_extract(self, url):
 593         # Extract id and simplified title from URL
 594         mobj = re.match(self._VALID_URL, url)
 595         if mobj is None:
 596             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 597             return
 598
 599         video_id = mobj.group(1)
 600
 601         # Check if video comes from YouTube
 602         mobj2 = re.match(r'^yt-(.*)$', video_id)
 603         if mobj2 is not None:
 604             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 605             return
 606
 607         # Retrieve video webpage to extract further information
 608         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 609         try:
 610             self.report_download_webpage(video_id)
 611             webpage = compat_urllib_request.urlopen(request).read()
 612         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 613             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 614             return
 615
 616         # Extract URL, uploader and title from webpage
 617         self.report_extraction(video_id)
 618         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 619         if mobj is not None:
 620             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 621             video_extension = mediaURL[-3:]
 622
 623             # Extract gdaKey if available
 624             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 625             if mobj is None:
 626                 video_url = mediaURL
 627             else:
 628                 gdaKey = mobj.group(1)
 629                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 630         else:
 631             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 632             if mobj is None:
 633                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 634                 return
 635             vardict = compat_parse_qs(mobj.group(1))
 636             if 'mediaData' not in vardict:
 637                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 638                 return
 639             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 640             if mobj is None:
 641                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 642                 return
 643             mediaURL = mobj.group(1).replace('\\/', '/')
 644             video_extension = mediaURL[-3:]
 645             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 646
 647         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 648         if mobj is None:
 649             self._downloader.trouble(u'ERROR: unable to extract title')
 650             return
 651         video_title = mobj.group(1).decode('utf-8')
 652
 653         mobj = re.search(r'submitter=(.*?);', webpage)
 654         if mobj is None:
 655             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 656             return
 657         video_uploader = mobj.group(1)
 658
 659         return [{
 660             'id':       video_id.decode('utf-8'),
 661             'url':      video_url.decode('utf-8'),
 662             'uploader': video_uploader.decode('utf-8'),
 663             'upload_date':  None,
 664             'title':    video_title,
 665             'ext':      video_extension.decode('utf-8'),
 666         }]
 667
 668
 669 class DailymotionIE(InfoExtractor):
 670     """Information Extractor for Dailymotion"""
 671
 672     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 673     IE_NAME = u'dailymotion'
 674
 675     def __init__(self, downloader=None):
 676         InfoExtractor.__init__(self, downloader)
 677
 678     def report_extraction(self, video_id):
 679         """Report information extraction."""
 680         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 681
 682     def _real_extract(self, url):
 683         # Extract id and simplified title from URL
 684         mobj = re.match(self._VALID_URL, url)
 685         if mobj is None:
 686             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 687             return
 688
 689         video_id = mobj.group(1).split('_')[0].split('?')[0]
 690
 691         video_extension = 'mp4'
 692
 693         # Retrieve video webpage to extract further information
 694         request = compat_urllib_request.Request(url)
 695         request.add_header('Cookie', 'family_filter=off')
 696         webpage = self._download_webpage(request, video_id)
 697
 698         # Extract URL, uploader and title from webpage
 699         self.report_extraction(video_id)
 700         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 701         if mobj is None:
 702             self._downloader.trouble(u'ERROR: unable to extract media URL')
 703             return
 704         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 705
 706         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 707             if key in flashvars:
 708                 max_quality = key
 709                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 710                 break
 711         else:
 712             self._downloader.trouble(u'ERROR: unable to extract video URL')
 713             return
 714
 715         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 716         if mobj is None:
 717             self._downloader.trouble(u'ERROR: unable to extract video URL')
 718             return
 719
 720         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 721
 722         # TODO: support choosing qualities
 723
 724         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 725         if mobj is None:
 726             self._downloader.trouble(u'ERROR: unable to extract title')
 727             return
 728         video_title = unescapeHTML(mobj.group('title'))
 729
 730         video_uploader = None
 731         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 732         if mobj is None:
 733             # lookin for official user
 734             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 735             if mobj_official is None:
 736                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 737             else:
 738                 video_uploader = mobj_official.group(1)
 739         else:
 740             video_uploader = mobj.group(1)
 741
 742         video_upload_date = None
 743         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 744         if mobj is not None:
 745             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 746
 747         return [{
 748             'id':       video_id,
 749             'url':      video_url,
 750             'uploader': video_uploader,
 751             'upload_date':  video_upload_date,
 752             'title':    video_title,
 753             'ext':      video_extension,
 754         }]
 755
 756
 757 class PhotobucketIE(InfoExtractor):
 758     """Information extractor for photobucket.com."""
 759
 760     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 761     IE_NAME = u'photobucket'
 762
 763     def __init__(self, downloader=None):
 764         InfoExtractor.__init__(self, downloader)
 765
 766     def report_download_webpage(self, video_id):
 767         """Report webpage download."""
 768         self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 769
 770     def report_extraction(self, video_id):
 771         """Report information extraction."""
 772         self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 773
 774     def _real_extract(self, url):
 775         # Extract id from URL
 776         mobj = re.match(self._VALID_URL, url)
 777         if mobj is None:
 778             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 779             return
 780
 781         video_id = mobj.group(1)
 782
 783         video_extension = 'flv'
 784
 785         # Retrieve video webpage to extract further information
 786         request = compat_urllib_request.Request(url)
 787         try:
 788             self.report_download_webpage(video_id)
 789             webpage = compat_urllib_request.urlopen(request).read()
 790         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 791             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 792             return
 793
 794         # Extract URL, uploader, and title from webpage
 795         self.report_extraction(video_id)
 796         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 797         if mobj is None:
 798             self._downloader.trouble(u'ERROR: unable to extract media URL')
 799             return
 800         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 801
 802         video_url = mediaURL
 803
 804         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 805         if mobj is None:
 806             self._downloader.trouble(u'ERROR: unable to extract title')
 807             return
 808         video_title = mobj.group(1).decode('utf-8')
 809
 810         video_uploader = mobj.group(2).decode('utf-8')
 811
 812         return [{
 813             'id':       video_id.decode('utf-8'),
 814             'url':      video_url.decode('utf-8'),
 815             'uploader': video_uploader,
 816             'upload_date':  None,
 817             'title':    video_title,
 818             'ext':      video_extension.decode('utf-8'),
 819         }]
 820
 821
 822 class YahooIE(InfoExtractor):
 823     """Information extractor for video.yahoo.com."""
 824
 825     _WORKING = False
 826     # _VALID_URL matches all Yahoo! Video URLs
 827     # _VPAGE_URL matches only the extractable '/watch/' URLs
 828     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 829     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 830     IE_NAME = u'video.yahoo'
 831
 832     def __init__(self, downloader=None):
 833         InfoExtractor.__init__(self, downloader)
 834
 835     def report_download_webpage(self, video_id):
 836         """Report webpage download."""
 837         self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 838
 839     def report_extraction(self, video_id):
 840         """Report information extraction."""
 841         self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 842
 843     def _real_extract(self, url, new_video=True):
 844         # Extract ID from URL
 845         mobj = re.match(self._VALID_URL, url)
 846         if mobj is None:
 847             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 848             return
 849
 850         video_id = mobj.group(2)
 851         video_extension = 'flv'
 852
 853         # Rewrite valid but non-extractable URLs as
 854         # extractable English language /watch/ URLs
 855         if re.match(self._VPAGE_URL, url) is None:
 856             request = compat_urllib_request.Request(url)
 857             try:
 858                 webpage = compat_urllib_request.urlopen(request).read()
 859             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 860                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 861                 return
 862
 863             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 864             if mobj is None:
 865                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 866                 return
 867             yahoo_id = mobj.group(1)
 868
 869             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 870             if mobj is None:
 871                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 872                 return
 873             yahoo_vid = mobj.group(1)
 874
 875             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 876             return self._real_extract(url, new_video=False)
 877
 878         # Retrieve video webpage to extract further information
 879         request = compat_urllib_request.Request(url)
 880         try:
 881             self.report_download_webpage(video_id)
 882             webpage = compat_urllib_request.urlopen(request).read()
 883         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 884             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 885             return
 886
 887         # Extract uploader and title from webpage
 888         self.report_extraction(video_id)
 889         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 890         if mobj is None:
 891             self._downloader.trouble(u'ERROR: unable to extract video title')
 892             return
 893         video_title = mobj.group(1).decode('utf-8')
 894
 895         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 896         if mobj is None:
 897             self._downloader.trouble(u'ERROR: unable to extract video uploader')
 898             return
 899         video_uploader = mobj.group(1).decode('utf-8')
 900
 901         # Extract video thumbnail
 902         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 903         if mobj is None:
 904             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 905             return
 906         video_thumbnail = mobj.group(1).decode('utf-8')
 907
 908         # Extract video description
 909         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 910         if mobj is None:
 911             self._downloader.trouble(u'ERROR: unable to extract video description')
 912             return
 913         video_description = mobj.group(1).decode('utf-8')
 914         if not video_description:
 915             video_description = 'No description available.'
 916
 917         # Extract video height and width
 918         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 919         if mobj is None:
 920             self._downloader.trouble(u'ERROR: unable to extract video height')
 921             return
 922         yv_video_height = mobj.group(1)
 923
 924         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 925         if mobj is None:
 926             self._downloader.trouble(u'ERROR: unable to extract video width')
 927             return
 928         yv_video_width = mobj.group(1)
 929
 930         # Retrieve video playlist to extract media URL
 931         # I'm not completely sure what all these options are, but we
 932         # seem to need most of them, otherwise the server sends a 401.
 933         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 934         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 935         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 936                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 937                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 938         try:
 939             self.report_download_webpage(video_id)
 940             webpage = compat_urllib_request.urlopen(request).read()
 941         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 942             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 943             return
 944
 945         # Extract media URL from playlist XML
 946         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
 947         if mobj is None:
 948             self._downloader.trouble(u'ERROR: Unable to extract media URL')
 949             return
 950         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
 951         video_url = unescapeHTML(video_url)
 952
 953         return [{
 954             'id':       video_id.decode('utf-8'),
 955             'url':      video_url,
 956             'uploader': video_uploader,
 957             'upload_date':  None,
 958             'title':    video_title,
 959             'ext':      video_extension.decode('utf-8'),
 960             'thumbnail':    video_thumbnail.decode('utf-8'),
 961             'description':  video_description,
 962         }]
 963
 964
 965 class VimeoIE(InfoExtractor):
 966     """Information extractor for vimeo.com."""
 967
 968     # _VALID_URL matches Vimeo URLs
 969     _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
 970     IE_NAME = u'vimeo'
 971
 972     def __init__(self, downloader=None):
 973         InfoExtractor.__init__(self, downloader)
 974
 975     def report_download_webpage(self, video_id):
 976         """Report webpage download."""
 977         self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
 978
 979     def report_extraction(self, video_id):
 980         """Report information extraction."""
 981         self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
 982
 983     def _real_extract(self, url, new_video=True):
 984         # Extract ID from URL
 985         mobj = re.match(self._VALID_URL, url)
 986         if mobj is None:
 987             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 988             return
 989
 990         video_id = mobj.group(1)
 991
 992         # Retrieve video webpage to extract further information
 993         request = compat_urllib_request.Request(url, None, std_headers)
 994         try:
 995             self.report_download_webpage(video_id)
 996             webpage_bytes = compat_urllib_request.urlopen(request).read()
 997             webpage = webpage_bytes.decode('utf-8')
 998         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 999             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1000             return
1001
1002         # Now we begin extracting as much information as we can from what we
1003         # retrieved. First we extract the information common to all extractors,
1004         # and latter we extract those that are Vimeo specific.
1005         self.report_extraction(video_id)
1006
1007         # Extract the config JSON
1008         try:
1009             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1010             config = json.loads(config)
1011         except:
1012             self._downloader.trouble(u'ERROR: unable to extract info section')
1013             return
1014
1015         # Extract title
1016         video_title = config["video"]["title"]
1017
1018         # Extract uploader and uploader_id
1019         video_uploader = config["video"]["owner"]["name"]
1020         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1021
1022         # Extract video thumbnail
1023         video_thumbnail = config["video"]["thumbnail"]
1024
1025         # Extract video description
1026         video_description = get_element_by_attribute("itemprop", "description", webpage)
1027         if video_description: video_description = clean_html(video_description)
1028         else: video_description = ''
1029
1030         # Extract upload date
1031         video_upload_date = None
1032         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1033         if mobj is not None:
1034             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1035
1036         # Vimeo specific: extract request signature and timestamp
1037         sig = config['request']['signature']
1038         timestamp = config['request']['timestamp']
1039
1040         # Vimeo specific: extract video codec and quality information
1041         # First consider quality, then codecs, then take everything
1042         # TODO bind to format param
1043         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1044         files = { 'hd': [], 'sd': [], 'other': []}
1045         for codec_name, codec_extension in codecs:
1046             if codec_name in config["video"]["files"]:
1047                 if 'hd' in config["video"]["files"][codec_name]:
1048                     files['hd'].append((codec_name, codec_extension, 'hd'))
1049                 elif 'sd' in config["video"]["files"][codec_name]:
1050                     files['sd'].append((codec_name, codec_extension, 'sd'))
1051                 else:
1052                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1053
1054         for quality in ('hd', 'sd', 'other'):
1055             if len(files[quality]) > 0:
1056                 video_quality = files[quality][0][2]
1057                 video_codec = files[quality][0][0]
1058                 video_extension = files[quality][0][1]
1059                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1060                 break
1061         else:
1062             self._downloader.trouble(u'ERROR: no known codec found')
1063             return
1064
1065         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1066                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1067
1068         return [{
1069             'id':       video_id,
1070             'url':      video_url,
1071             'uploader': video_uploader,
1072             'uploader_id': video_uploader_id,
1073             'upload_date':  video_upload_date,
1074             'title':    video_title,
1075             'ext':      video_extension,
1076             'thumbnail':    video_thumbnail,
1077             'description':  video_description,
1078         }]
1079
1080
1081 class ArteTvIE(InfoExtractor):
1082     """arte.tv information extractor."""
1083
1084     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1085     _LIVE_URL = r'index-[0-9]+\.html$'
1086
1087     IE_NAME = u'arte.tv'
1088
1089     def __init__(self, downloader=None):
1090         InfoExtractor.__init__(self, downloader)
1091
1092     def report_download_webpage(self, video_id):
1093         """Report webpage download."""
1094         self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1095
1096     def report_extraction(self, video_id):
1097         """Report information extraction."""
1098         self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1099
1100     def fetch_webpage(self, url):
1101         request = compat_urllib_request.Request(url)
1102         try:
1103             self.report_download_webpage(url)
1104             webpage = compat_urllib_request.urlopen(request).read()
1105         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1106             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1107             return
1108         except ValueError as err:
1109             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1110             return
1111         return webpage
1112
1113     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1114         page = self.fetch_webpage(url)
1115         mobj = re.search(regex, page, regexFlags)
1116         info = {}
1117
1118         if mobj is None:
1119             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1120             return
1121
1122         for (i, key, err) in matchTuples:
1123             if mobj.group(i) is None:
1124                 self._downloader.trouble(err)
1125                 return
1126             else:
1127                 info[key] = mobj.group(i)
1128
1129         return info
1130
1131     def extractLiveStream(self, url):
1132         video_lang = url.split('/')[-4]
1133         info = self.grep_webpage(
1134             url,
1135             r'src="(.*?/videothek_js.*?\.js)',
1136             0,
1137             [
1138                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1139             ]
1140         )
1141         http_host = url.split('/')[2]
1142         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1143         info = self.grep_webpage(
1144             next_url,
1145             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1146                 '(http://.*?\.swf).*?' +
1147                 '(rtmp://.*?)\'',
1148             re.DOTALL,
1149             [
1150                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1151                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1152                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1153             ]
1154         )
1155         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1156
1157     def extractPlus7Stream(self, url):
1158         video_lang = url.split('/')[-3]
1159         info = self.grep_webpage(
1160             url,
1161             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1162             0,
1163             [
1164                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1165             ]
1166         )
1167         next_url = compat_urllib_parse.unquote(info.get('url'))
1168         info = self.grep_webpage(
1169             next_url,
1170             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1171             0,
1172             [
1173                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1174             ]
1175         )
1176         next_url = compat_urllib_parse.unquote(info.get('url'))
1177
1178         info = self.grep_webpage(
1179             next_url,
1180             r'<video id="(.*?)".*?>.*?' +
1181                 '<name>(.*?)</name>.*?' +
1182                 '<dateVideo>(.*?)</dateVideo>.*?' +
1183                 '<url quality="hd">(.*?)</url>',
1184             re.DOTALL,
1185             [
1186                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1187                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1188                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1189                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1190             ]
1191         )
1192
1193         return {
1194             'id':           info.get('id'),
1195             'url':          compat_urllib_parse.unquote(info.get('url')),
1196             'uploader':     u'arte.tv',
1197             'upload_date':  info.get('date'),
1198             'title':        info.get('title').decode('utf-8'),
1199             'ext':          u'mp4',
1200             'format':       u'NA',
1201             'player_url':   None,
1202         }
1203
1204     def _real_extract(self, url):
1205         video_id = url.split('/')[-1]
1206         self.report_extraction(video_id)
1207
1208         if re.search(self._LIVE_URL, video_id) is not None:
1209             self.extractLiveStream(url)
1210             return
1211         else:
1212             info = self.extractPlus7Stream(url)
1213
1214         return [info]
1215
1216
1217 class GenericIE(InfoExtractor):
1218     """Generic last-resort information extractor."""
1219
1220     _VALID_URL = r'.*'
1221     IE_NAME = u'generic'
1222
1223     def __init__(self, downloader=None):
1224         InfoExtractor.__init__(self, downloader)
1225
1226     def report_download_webpage(self, video_id):
1227         """Report webpage download."""
1228         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1229         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1230
1231     def report_extraction(self, video_id):
1232         """Report information extraction."""
1233         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1234
1235     def report_following_redirect(self, new_url):
1236         """Report information extraction."""
1237         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1238
1239     def _test_redirect(self, url):
1240         """Check if it is a redirect, like url shorteners, in case restart chain."""
1241         class HeadRequest(compat_urllib_request.Request):
1242             def get_method(self):
1243                 return "HEAD"
1244
1245         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1246             """
1247             Subclass the HTTPRedirectHandler to make it use our
1248             HeadRequest also on the redirected URL
1249             """
1250             def redirect_request(self, req, fp, code, msg, headers, newurl):
1251                 if code in (301, 302, 303, 307):
1252                     newurl = newurl.replace(' ', '%20')
1253                     newheaders = dict((k,v) for k,v in req.headers.items()
1254                                       if k.lower() not in ("content-length", "content-type"))
1255                     return HeadRequest(newurl,
1256                                        headers=newheaders,
1257                                        origin_req_host=req.get_origin_req_host(),
1258                                        unverifiable=True)
1259                 else:
1260                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1261
1262         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1263             """
1264             Fallback to GET if HEAD is not allowed (405 HTTP error)
1265             """
1266             def http_error_405(self, req, fp, code, msg, headers):
1267                 fp.read()
1268                 fp.close()
1269
1270                 newheaders = dict((k,v) for k,v in req.headers.items()
1271                                   if k.lower() not in ("content-length", "content-type"))
1272                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1273                                                  headers=newheaders,
1274                                                  origin_req_host=req.get_origin_req_host(),
1275                                                  unverifiable=True))
1276
1277         # Build our opener
1278         opener = compat_urllib_request.OpenerDirector()
1279         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1280                         HTTPMethodFallback, HEADRedirectHandler,
1281                         compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1282             opener.add_handler(handler())
1283
1284         response = opener.open(HeadRequest(url))
1285         new_url = response.geturl()
1286
1287         if url == new_url:
1288             return False
1289
1290         self.report_following_redirect(new_url)
1291         self._downloader.download([new_url])
1292         return True
1293
1294     def _real_extract(self, url):
1295         if self._test_redirect(url): return
1296
1297         video_id = url.split('/')[-1]
1298         request = compat_urllib_request.Request(url)
1299         try:
1300             self.report_download_webpage(video_id)
1301             webpage = compat_urllib_request.urlopen(request).read()
1302         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1303             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1304             return
1305         except ValueError as err:
1306             # since this is the last-resort InfoExtractor, if
1307             # this error is thrown, it'll be thrown here
1308             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1309             return
1310
1311         self.report_extraction(video_id)
1312         # Start with something easy: JW Player in SWFObject
1313         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1314         if mobj is None:
1315             # Broaden the search a little bit
1316             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1317         if mobj is None:
1318             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1319             return
1320
1321         # It's possible that one of the regexes
1322         # matched, but returned an empty group:
1323         if mobj.group(1) is None:
1324             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1325             return
1326
1327         video_url = compat_urllib_parse.unquote(mobj.group(1))
1328         video_id = os.path.basename(video_url)
1329
1330         # here's a fun little line of code for you:
1331         video_extension = os.path.splitext(video_id)[1][1:]
1332         video_id = os.path.splitext(video_id)[0]
1333
1334         # it's tempting to parse this further, but you would
1335         # have to take into account all the variations like
1336         #   Video Title - Site Name
1337         #   Site Name | Video Title
1338         #   Video Title - Tagline | Site Name
1339         # and so on and so forth; it's just not practical
1340         mobj = re.search(r'<title>(.*)</title>', webpage)
1341         if mobj is None:
1342             self._downloader.trouble(u'ERROR: unable to extract title')
1343             return
1344         video_title = mobj.group(1)
1345
1346         # video uploader is domain name
1347         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1348         if mobj is None:
1349             self._downloader.trouble(u'ERROR: unable to extract title')
1350             return
1351         video_uploader = mobj.group(1)
1352
1353         return [{
1354             'id':       video_id,
1355             'url':      video_url,
1356             'uploader': video_uploader,
1357             'upload_date':  None,
1358             'title':    video_title,
1359             'ext':      video_extension,
1360         }]
1361
1362
1363 class YoutubeSearchIE(InfoExtractor):
1364     """Information Extractor for YouTube search queries."""
1365     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1366     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1367     _max_youtube_results = 1000
1368     IE_NAME = u'youtube:search'
1369
1370     def __init__(self, downloader=None):
1371         InfoExtractor.__init__(self, downloader)
1372
1373     def report_download_page(self, query, pagenum):
1374         """Report attempt to download search page with given number."""
1375         query = query.decode(preferredencoding())
1376         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1377
1378     def _real_extract(self, query):
1379         mobj = re.match(self._VALID_URL, query)
1380         if mobj is None:
1381             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1382             return
1383
1384         prefix, query = query.split(':')
1385         prefix = prefix[8:]
1386         query = query.encode('utf-8')
1387         if prefix == '':
1388             self._download_n_results(query, 1)
1389             return
1390         elif prefix == 'all':
1391             self._download_n_results(query, self._max_youtube_results)
1392             return
1393         else:
1394             try:
1395                 n = int(prefix)
1396                 if n <= 0:
1397                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1398                     return
1399                 elif n > self._max_youtube_results:
1400                     self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1401                     n = self._max_youtube_results
1402                 self._download_n_results(query, n)
1403                 return
1404             except ValueError: # parsing prefix as integer fails
1405                 self._download_n_results(query, 1)
1406                 return
1407
1408     def _download_n_results(self, query, n):
1409         """Downloads a specified number of results for a query"""
1410
1411         video_ids = []
1412         pagenum = 0
1413         limit = n
1414
1415         while (50 * pagenum) < limit:
1416             self.report_download_page(query, pagenum+1)
1417             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1418             request = compat_urllib_request.Request(result_url)
1419             try:
1420                 data = compat_urllib_request.urlopen(request).read()
1421             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1422                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1423                 return
1424             api_response = json.loads(data)['data']
1425
1426             new_ids = list(video['id'] for video in api_response['items'])
1427             video_ids += new_ids
1428
1429             limit = min(n, api_response['totalItems'])
1430             pagenum += 1
1431
1432         if len(video_ids) > n:
1433             video_ids = video_ids[:n]
1434         for id in video_ids:
1435             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1436         return
1437
1438
1439 class GoogleSearchIE(InfoExtractor):
1440     """Information Extractor for Google Video search queries."""
1441     _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1442     _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1443     _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1444     _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1445     _max_google_results = 1000
1446     IE_NAME = u'video.google:search'
1447
1448     def __init__(self, downloader=None):
1449         InfoExtractor.__init__(self, downloader)
1450
1451     def report_download_page(self, query, pagenum):
1452         """Report attempt to download playlist page with given number."""
1453         query = query.decode(preferredencoding())
1454         self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1455
1456     def _real_extract(self, query):
1457         mobj = re.match(self._VALID_URL, query)
1458         if mobj is None:
1459             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1460             return
1461
1462         prefix, query = query.split(':')
1463         prefix = prefix[8:]
1464         query = query.encode('utf-8')
1465         if prefix == '':
1466             self._download_n_results(query, 1)
1467             return
1468         elif prefix == 'all':
1469             self._download_n_results(query, self._max_google_results)
1470             return
1471         else:
1472             try:
1473                 n = int(prefix)
1474                 if n <= 0:
1475                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1476                     return
1477                 elif n > self._max_google_results:
1478                     self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1479                     n = self._max_google_results
1480                 self._download_n_results(query, n)
1481                 return
1482             except ValueError: # parsing prefix as integer fails
1483                 self._download_n_results(query, 1)
1484                 return
1485
1486     def _download_n_results(self, query, n):
1487         """Downloads a specified number of results for a query"""
1488
1489         video_ids = []
1490         pagenum = 0
1491
1492         while True:
1493             self.report_download_page(query, pagenum)
1494             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1495             request = compat_urllib_request.Request(result_url)
1496             try:
1497                 page = compat_urllib_request.urlopen(request).read()
1498             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1499                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1500                 return
1501
1502             # Extract video identifiers
1503             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1504                 video_id = mobj.group(1)
1505                 if video_id not in video_ids:
1506                     video_ids.append(video_id)
1507                     if len(video_ids) == n:
1508                         # Specified n videos reached
1509                         for id in video_ids:
1510                             self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1511                         return
1512
1513             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1514                 for id in video_ids:
1515                     self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1516                 return
1517
1518             pagenum = pagenum + 1
1519
1520
1521 class YahooSearchIE(InfoExtractor):
1522     """Information Extractor for Yahoo! Video search queries."""
1523
1524     _WORKING = False
1525     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1526     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1527     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1528     _MORE_PAGES_INDICATOR = r'\s*Next'
1529     _max_yahoo_results = 1000
1530     IE_NAME = u'video.yahoo:search'
1531
1532     def __init__(self, downloader=None):
1533         InfoExtractor.__init__(self, downloader)
1534
1535     def report_download_page(self, query, pagenum):
1536         """Report attempt to download playlist page with given number."""
1537         query = query.decode(preferredencoding())
1538         self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1539
1540     def _real_extract(self, query):
1541         mobj = re.match(self._VALID_URL, query)
1542         if mobj is None:
1543             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1544             return
1545
1546         prefix, query = query.split(':')
1547         prefix = prefix[8:]
1548         query = query.encode('utf-8')
1549         if prefix == '':
1550             self._download_n_results(query, 1)
1551             return
1552         elif prefix == 'all':
1553             self._download_n_results(query, self._max_yahoo_results)
1554             return
1555         else:
1556             try:
1557                 n = int(prefix)
1558                 if n <= 0:
1559                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1560                     return
1561                 elif n > self._max_yahoo_results:
1562                     self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1563                     n = self._max_yahoo_results
1564                 self._download_n_results(query, n)
1565                 return
1566             except ValueError: # parsing prefix as integer fails
1567                 self._download_n_results(query, 1)
1568                 return
1569
1570     def _download_n_results(self, query, n):
1571         """Downloads a specified number of results for a query"""
1572
1573         video_ids = []
1574         already_seen = set()
1575         pagenum = 1
1576
1577         while True:
1578             self.report_download_page(query, pagenum)
1579             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1580             request = compat_urllib_request.Request(result_url)
1581             try:
1582                 page = compat_urllib_request.urlopen(request).read()
1583             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1584                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1585                 return
1586
1587             # Extract video identifiers
1588             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1589                 video_id = mobj.group(1)
1590                 if video_id not in already_seen:
1591                     video_ids.append(video_id)
1592                     already_seen.add(video_id)
1593                     if len(video_ids) == n:
1594                         # Specified n videos reached
1595                         for id in video_ids:
1596                             self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1597                         return
1598
1599             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1600                 for id in video_ids:
1601                     self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1602                 return
1603
1604             pagenum = pagenum + 1
1605
1606
1607 class YoutubePlaylistIE(InfoExtractor):
1608     """Information Extractor for YouTube playlists."""
1609
1610     _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1611     _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1612     _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1613     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1614     IE_NAME = u'youtube:playlist'
1615
1616     def __init__(self, downloader=None):
1617         InfoExtractor.__init__(self, downloader)
1618
1619     def report_download_page(self, playlist_id, pagenum):
1620         """Report attempt to download playlist page with given number."""
1621         self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1622
1623     def _real_extract(self, url):
1624         # Extract playlist id
1625         mobj = re.match(self._VALID_URL, url)
1626         if mobj is None:
1627             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1628             return
1629
1630         # Single video case
1631         if mobj.group(3) is not None:
1632             self._downloader.download([mobj.group(3)])
1633             return
1634
1635         # Download playlist pages
1636         # prefix is 'p' as default for playlists but there are other types that need extra care
1637         playlist_prefix = mobj.group(1)
1638         if playlist_prefix == 'a':
1639             playlist_access = 'artist'
1640         else:
1641             playlist_prefix = 'p'
1642             playlist_access = 'view_play_list'
1643         playlist_id = mobj.group(2)
1644         video_ids = []
1645         pagenum = 1
1646
1647         while True:
1648             self.report_download_page(playlist_id, pagenum)
1649             url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1650             request = compat_urllib_request.Request(url)
1651             try:
1652                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1653             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1654                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1655                 return
1656
1657             # Extract video identifiers
1658             ids_in_page = []
1659             for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1660                 if mobj.group(1) not in ids_in_page:
1661                     ids_in_page.append(mobj.group(1))
1662             video_ids.extend(ids_in_page)
1663
1664             if self._MORE_PAGES_INDICATOR not in page:
1665                 break
1666             pagenum = pagenum + 1
1667
1668         total = len(video_ids)
1669
1670         playliststart = self._downloader.params.get('playliststart', 1) - 1
1671         playlistend = self._downloader.params.get('playlistend', -1)
1672         if playlistend == -1:
1673             video_ids = video_ids[playliststart:]
1674         else:
1675             video_ids = video_ids[playliststart:playlistend]
1676
1677         if len(video_ids) == total:
1678             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1679         else:
1680             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1681
1682         for id in video_ids:
1683             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1684         return
1685
1686
1687 class YoutubeChannelIE(InfoExtractor):
1688     """Information Extractor for YouTube channels."""
1689
1690     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1691     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1692     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1693     IE_NAME = u'youtube:channel'
1694
1695     def report_download_page(self, channel_id, pagenum):
1696         """Report attempt to download channel page with given number."""
1697         self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1698
1699     def _real_extract(self, url):
1700         # Extract channel id
1701         mobj = re.match(self._VALID_URL, url)
1702         if mobj is None:
1703             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1704             return
1705
1706         # Download channel pages
1707         channel_id = mobj.group(1)
1708         video_ids = []
1709         pagenum = 1
1710
1711         while True:
1712             self.report_download_page(channel_id, pagenum)
1713             url = self._TEMPLATE_URL % (channel_id, pagenum)
1714             request = compat_urllib_request.Request(url)
1715             try:
1716                 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1717             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1718                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1719                 return
1720
1721             # Extract video identifiers
1722             ids_in_page = []
1723             for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1724                 if mobj.group(1) not in ids_in_page:
1725                     ids_in_page.append(mobj.group(1))
1726             video_ids.extend(ids_in_page)
1727
1728             if self._MORE_PAGES_INDICATOR not in page:
1729                 break
1730             pagenum = pagenum + 1
1731
1732         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1733
1734         for id in video_ids:
1735             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1736         return
1737
1738
1739 class YoutubeUserIE(InfoExtractor):
1740     """Information Extractor for YouTube users."""
1741
1742     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1743     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1744     _GDATA_PAGE_SIZE = 50
1745     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1746     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1747     IE_NAME = u'youtube:user'
1748
1749     def __init__(self, downloader=None):
1750         InfoExtractor.__init__(self, downloader)
1751
1752     def report_download_page(self, username, start_index):
1753         """Report attempt to download user page."""
1754         self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1755                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1756
1757     def _real_extract(self, url):
1758         # Extract username
1759         mobj = re.match(self._VALID_URL, url)
1760         if mobj is None:
1761             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1762             return
1763
1764         username = mobj.group(1)
1765
1766         # Download video ids using YouTube Data API. Result size per
1767         # query is limited (currently to 50 videos) so we need to query
1768         # page by page until there are no video ids - it means we got
1769         # all of them.
1770
1771         video_ids = []
1772         pagenum = 0
1773
1774         while True:
1775             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1776             self.report_download_page(username, start_index)
1777
1778             request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1779
1780             try:
1781                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1782             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1783                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1784                 return
1785
1786             # Extract video identifiers
1787             ids_in_page = []
1788
1789             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1790                 if mobj.group(1) not in ids_in_page:
1791                     ids_in_page.append(mobj.group(1))
1792
1793             video_ids.extend(ids_in_page)
1794
1795             # A little optimization - if current page is not
1796             # "full", ie. does not contain PAGE_SIZE video ids then
1797             # we can assume that this page is the last one - there
1798             # are no more ids on further pages - no need to query
1799             # again.
1800
1801             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1802                 break
1803
1804             pagenum += 1
1805
1806         all_ids_count = len(video_ids)
1807         playliststart = self._downloader.params.get('playliststart', 1) - 1
1808         playlistend = self._downloader.params.get('playlistend', -1)
1809
1810         if playlistend == -1:
1811             video_ids = video_ids[playliststart:]
1812         else:
1813             video_ids = video_ids[playliststart:playlistend]
1814
1815         self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1816                 (username, all_ids_count, len(video_ids)))
1817
1818         for video_id in video_ids:
1819             self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1820
1821
1822 class BlipTVUserIE(InfoExtractor):
1823     """Information Extractor for blip.tv users."""
1824
1825     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1826     _PAGE_SIZE = 12
1827     IE_NAME = u'blip.tv:user'
1828
1829     def __init__(self, downloader=None):
1830         InfoExtractor.__init__(self, downloader)
1831
1832     def report_download_page(self, username, pagenum):
1833         """Report attempt to download user page."""
1834         self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1835                 (self.IE_NAME, username, pagenum))
1836
1837     def _real_extract(self, url):
1838         # Extract username
1839         mobj = re.match(self._VALID_URL, url)
1840         if mobj is None:
1841             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1842             return
1843
1844         username = mobj.group(1)
1845
1846         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1847
1848         request = compat_urllib_request.Request(url)
1849
1850         try:
1851             page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1852             mobj = re.search(r'data-users-id="([^"]+)"', page)
1853             page_base = page_base % mobj.group(1)
1854         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1855             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1856             return
1857
1858
1859         # Download video ids using BlipTV Ajax calls. Result size per
1860         # query is limited (currently to 12 videos) so we need to query
1861         # page by page until there are no video ids - it means we got
1862         # all of them.
1863
1864         video_ids = []
1865         pagenum = 1
1866
1867         while True:
1868             self.report_download_page(username, pagenum)
1869
1870             request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1871
1872             try:
1873                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1874             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1875                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1876                 return
1877
1878             # Extract video identifiers
1879             ids_in_page = []
1880
1881             for mobj in re.finditer(r'href="/([^"]+)"', page):
1882                 if mobj.group(1) not in ids_in_page:
1883                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1884
1885             video_ids.extend(ids_in_page)
1886
1887             # A little optimization - if current page is not
1888             # "full", ie. does not contain PAGE_SIZE video ids then
1889             # we can assume that this page is the last one - there
1890             # are no more ids on further pages - no need to query
1891             # again.
1892
1893             if len(ids_in_page) < self._PAGE_SIZE:
1894                 break
1895
1896             pagenum += 1
1897
1898         all_ids_count = len(video_ids)
1899         playliststart = self._downloader.params.get('playliststart', 1) - 1
1900         playlistend = self._downloader.params.get('playlistend', -1)
1901
1902         if playlistend == -1:
1903             video_ids = video_ids[playliststart:]
1904         else:
1905             video_ids = video_ids[playliststart:playlistend]
1906
1907         self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1908                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1909
1910         for video_id in video_ids:
1911             self._downloader.download([u'http://blip.tv/'+video_id])
1912
1913
1914 class DepositFilesIE(InfoExtractor):
1915     """Information extractor for depositfiles.com"""
1916
1917     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1918
1919     def report_download_webpage(self, file_id):
1920         """Report webpage download."""
1921         self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1922
1923     def report_extraction(self, file_id):
1924         """Report information extraction."""
1925         self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1926
1927     def _real_extract(self, url):
1928         file_id = url.split('/')[-1]
1929         # Rebuild url in english locale
1930         url = 'http://depositfiles.com/en/files/' + file_id
1931
1932         # Retrieve file webpage with 'Free download' button pressed
1933         free_download_indication = { 'gateway_result' : '1' }
1934         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1935         try:
1936             self.report_download_webpage(file_id)
1937             webpage = compat_urllib_request.urlopen(request).read()
1938         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1939             self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1940             return
1941
1942         # Search for the real file URL
1943         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1944         if (mobj is None) or (mobj.group(1) is None):
1945             # Try to figure out reason of the error.
1946             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1947             if (mobj is not None) and (mobj.group(1) is not None):
1948                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1949                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1950             else:
1951                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1952             return
1953
1954         file_url = mobj.group(1)
1955         file_extension = os.path.splitext(file_url)[1][1:]
1956
1957         # Search for file title
1958         mobj = re.search(r'<b title="(.*?)">', webpage)
1959         if mobj is None:
1960             self._downloader.trouble(u'ERROR: unable to extract title')
1961             return
1962         file_title = mobj.group(1).decode('utf-8')
1963
1964         return [{
1965             'id':       file_id.decode('utf-8'),
1966             'url':      file_url.decode('utf-8'),
1967             'uploader': None,
1968             'upload_date':  None,
1969             'title':    file_title,
1970             'ext':      file_extension.decode('utf-8'),
1971         }]
1972
1973
1974 class FacebookIE(InfoExtractor):
1975     """Information Extractor for Facebook"""
1976
1977     _WORKING = False
1978     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1979     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1980     _NETRC_MACHINE = 'facebook'
1981     _available_formats = ['video', 'highqual', 'lowqual']
1982     _video_extensions = {
1983         'video': 'mp4',
1984         'highqual': 'mp4',
1985         'lowqual': 'mp4',
1986     }
1987     IE_NAME = u'facebook'
1988
1989     def __init__(self, downloader=None):
1990         InfoExtractor.__init__(self, downloader)
1991
1992     def _reporter(self, message):
1993         """Add header and report message."""
1994         self._downloader.to_screen(u'[facebook] %s' % message)
1995
1996     def report_login(self):
1997         """Report attempt to log in."""
1998         self._reporter(u'Logging in')
1999
2000     def report_video_webpage_download(self, video_id):
2001         """Report attempt to download video webpage."""
2002         self._reporter(u'%s: Downloading video webpage' % video_id)
2003
2004     def report_information_extraction(self, video_id):
2005         """Report attempt to extract video information."""
2006         self._reporter(u'%s: Extracting video information' % video_id)
2007
2008     def _parse_page(self, video_webpage):
2009         """Extract video information from page"""
2010         # General data
2011         data = {'title': r'\("video_title", "(.*?)"\)',
2012             'description': r'<div class="datawrap">(.*?)</div>',
2013             'owner': r'\("video_owner_name", "(.*?)"\)',
2014             'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2015             }
2016         video_info = {}
2017         for piece in data.keys():
2018             mobj = re.search(data[piece], video_webpage)
2019             if mobj is not None:
2020                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2021
2022         # Video urls
2023         video_urls = {}
2024         for fmt in self._available_formats:
2025             mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2026             if mobj is not None:
2027                 # URL is in a Javascript segment inside an escaped Unicode format within
2028                 # the generally utf-8 page
2029                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2030         video_info['video_urls'] = video_urls
2031
2032         return video_info
2033
2034     def _real_initialize(self):
2035         if self._downloader is None:
2036             return
2037
2038         useremail = None
2039         password = None
2040         downloader_params = self._downloader.params
2041
2042         # Attempt to use provided username and password or .netrc data
2043         if downloader_params.get('username', None) is not None:
2044             useremail = downloader_params['username']
2045             password = downloader_params['password']
2046         elif downloader_params.get('usenetrc', False):
2047             try:
2048                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2049                 if info is not None:
2050                     useremail = info[0]
2051                     password = info[2]
2052                 else:
2053                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2054             except (IOError, netrc.NetrcParseError) as err:
2055                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2056                 return
2057
2058         if useremail is None:
2059             return
2060
2061         # Log in
2062         login_form = {
2063             'email': useremail,
2064             'pass': password,
2065             'login': 'Log+In'
2066             }
2067         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2068         try:
2069             self.report_login()
2070             login_results = compat_urllib_request.urlopen(request).read()
2071             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2072                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2073                 return
2074         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2075             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2076             return
2077
2078     def _real_extract(self, url):
2079         mobj = re.match(self._VALID_URL, url)
2080         if mobj is None:
2081             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2082             return
2083         video_id = mobj.group('ID')
2084
2085         # Get video webpage
2086         self.report_video_webpage_download(video_id)
2087         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2088         try:
2089             page = compat_urllib_request.urlopen(request)
2090             video_webpage = page.read()
2091         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2092             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2093             return
2094
2095         # Start extracting information
2096         self.report_information_extraction(video_id)
2097
2098         # Extract information
2099         video_info = self._parse_page(video_webpage)
2100
2101         # uploader
2102         if 'owner' not in video_info:
2103             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2104             return
2105         video_uploader = video_info['owner']
2106
2107         # title
2108         if 'title' not in video_info:
2109             self._downloader.trouble(u'ERROR: unable to extract video title')
2110             return
2111         video_title = video_info['title']
2112         video_title = video_title.decode('utf-8')
2113
2114         # thumbnail image
2115         if 'thumbnail' not in video_info:
2116             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2117             video_thumbnail = ''
2118         else:
2119             video_thumbnail = video_info['thumbnail']
2120
2121         # upload date
2122         upload_date = None
2123         if 'upload_date' in video_info:
2124             upload_time = video_info['upload_date']
2125             timetuple = email.utils.parsedate_tz(upload_time)
2126             if timetuple is not None:
2127                 try:
2128                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2129                 except:
2130                     pass
2131
2132         # description
2133         video_description = video_info.get('description', 'No description available.')
2134
2135         url_map = video_info['video_urls']
2136         if url_map:
2137             # Decide which formats to download
2138             req_format = self._downloader.params.get('format', None)
2139             format_limit = self._downloader.params.get('format_limit', None)
2140
2141             if format_limit is not None and format_limit in self._available_formats:
2142                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2143             else:
2144                 format_list = self._available_formats
2145             existing_formats = [x for x in format_list if x in url_map]
2146             if len(existing_formats) == 0:
2147                 self._downloader.trouble(u'ERROR: no known formats available for video')
2148                 return
2149             if req_format is None:
2150                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2151             elif req_format == 'worst':
2152                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2153             elif req_format == '-1':
2154                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2155             else:
2156                 # Specific format
2157                 if req_format not in url_map:
2158                     self._downloader.trouble(u'ERROR: requested format not available')
2159                     return
2160                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2161
2162         results = []
2163         for format_param, video_real_url in video_url_list:
2164             # Extension
2165             video_extension = self._video_extensions.get(format_param, 'mp4')
2166
2167             results.append({
2168                 'id':       video_id.decode('utf-8'),
2169                 'url':      video_real_url.decode('utf-8'),
2170                 'uploader': video_uploader.decode('utf-8'),
2171                 'upload_date':  upload_date,
2172                 'title':    video_title,
2173                 'ext':      video_extension.decode('utf-8'),
2174                 'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
2175                 'thumbnail':    video_thumbnail.decode('utf-8'),
2176                 'description':  video_description.decode('utf-8'),
2177             })
2178         return results
2179
2180 class BlipTVIE(InfoExtractor):
2181     """Information extractor for blip.tv"""
2182
2183     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2184     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2185     IE_NAME = u'blip.tv'
2186
2187     def report_extraction(self, file_id):
2188         """Report information extraction."""
2189         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2190
2191     def report_direct_download(self, title):
2192         """Report information extraction."""
2193         self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2194
2195     def _real_extract(self, url):
2196         mobj = re.match(self._VALID_URL, url)
2197         if mobj is None:
2198             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2199             return
2200
2201         if '?' in url:
2202             cchar = '&'
2203         else:
2204             cchar = '?'
2205         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2206         request = compat_urllib_request.Request(json_url)
2207         self.report_extraction(mobj.group(1))
2208         info = None
2209         try:
2210             urlh = compat_urllib_request.urlopen(request)
2211             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2212                 basename = url.split('/')[-1]
2213                 title,ext = os.path.splitext(basename)
2214                 title = title.decode('UTF-8')
2215                 ext = ext.replace('.', '')
2216                 self.report_direct_download(title)
2217                 info = {
2218                     'id': title,
2219                     'url': url,
2220                     'uploader': None,
2221                     'upload_date': None,
2222                     'title': title,
2223                     'ext': ext,
2224                     'urlhandle': urlh
2225                 }
2226         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2227             self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2228             return
2229         if info is None: # Regular URL
2230             try:
2231                 json_code_bytes = urlh.read()
2232                 json_code = json_code_bytes.decode('utf-8')
2233             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2234                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2235                 return
2236
2237             try:
2238                 json_data = json.loads(json_code)
2239                 if 'Post' in json_data:
2240                     data = json_data['Post']
2241                 else:
2242                     data = json_data
2243
2244                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2245                 video_url = data['media']['url']
2246                 umobj = re.match(self._URL_EXT, video_url)
2247                 if umobj is None:
2248                     raise ValueError('Can not determine filename extension')
2249                 ext = umobj.group(1)
2250
2251                 info = {
2252                     'id': data['item_id'],
2253                     'url': video_url,
2254                     'uploader': data['display_name'],
2255                     'upload_date': upload_date,
2256                     'title': data['title'],
2257                     'ext': ext,
2258                     'format': data['media']['mimeType'],
2259                     'thumbnail': data['thumbnailUrl'],
2260                     'description': data['description'],
2261                     'player_url': data['embedUrl']
2262                 }
2263             except (ValueError,KeyError) as err:
2264                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2265                 return
2266
2267         std_headers['User-Agent'] = 'iTunes/10.6.1'
2268         return [info]
2269
2270
2271 class MyVideoIE(InfoExtractor):
2272     """Information Extractor for myvideo.de."""
2273
2274     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2275     IE_NAME = u'myvideo'
2276
2277     def __init__(self, downloader=None):
2278         InfoExtractor.__init__(self, downloader)
2279
2280     def report_extraction(self, video_id):
2281         """Report information extraction."""
2282         self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2283
2284     def _real_extract(self,url):
2285         mobj = re.match(self._VALID_URL, url)
2286         if mobj is None:
2287             self._download.trouble(u'ERROR: invalid URL: %s' % url)
2288             return
2289
2290         video_id = mobj.group(1)
2291
2292         # Get video webpage
2293         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2294         webpage = self._download_webpage(webpage_url, video_id)
2295
2296         self.report_extraction(video_id)
2297         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2298                  webpage)
2299         if mobj is None:
2300             self._downloader.trouble(u'ERROR: unable to extract media URL')
2301             return
2302         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2303
2304         mobj = re.search('<title>([^<]+)</title>', webpage)
2305         if mobj is None:
2306             self._downloader.trouble(u'ERROR: unable to extract title')
2307             return
2308
2309         video_title = mobj.group(1)
2310
2311         return [{
2312             'id':       video_id,
2313             'url':      video_url,
2314             'uploader': None,
2315             'upload_date':  None,
2316             'title':    video_title,
2317             'ext':      u'flv',
2318         }]
2319
2320 class ComedyCentralIE(InfoExtractor):
2321     """Information extractor for The Daily Show and Colbert Report """
2322
2323     # urls can be abbreviations like :thedailyshow or :colbert
2324     # urls for episodes like:
2325     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2326     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2327     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2328     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2329                       |(https?://)?(www\.)?
2330                           (?P<showname>thedailyshow|colbertnation)\.com/
2331                          (full-episodes/(?P<episode>.*)|
2332                           (?P<clip>
2333                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2334                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2335                      $"""
2336
2337     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2338
2339     _video_extensions = {
2340         '3500': 'mp4',
2341         '2200': 'mp4',
2342         '1700': 'mp4',
2343         '1200': 'mp4',
2344         '750': 'mp4',
2345         '400': 'mp4',
2346     }
2347     _video_dimensions = {
2348         '3500': '1280x720',
2349         '2200': '960x540',
2350         '1700': '768x432',
2351         '1200': '640x360',
2352         '750': '512x288',
2353         '400': '384x216',
2354     }
2355
2356     def suitable(self, url):
2357         """Receives a URL and returns True if suitable for this IE."""
2358         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2359
2360     def report_extraction(self, episode_id):
2361         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2362
2363     def report_config_download(self, episode_id, media_id):
2364         self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2365
2366     def report_index_download(self, episode_id):
2367         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2368
2369     def _print_formats(self, formats):
2370         print('Available formats:')
2371         for x in formats:
2372             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2373
2374
2375     def _real_extract(self, url):
2376         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2377         if mobj is None:
2378             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2379             return
2380
2381         if mobj.group('shortname'):
2382             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2383                 url = u'http://www.thedailyshow.com/full-episodes/'
2384             else:
2385                 url = u'http://www.colbertnation.com/full-episodes/'
2386             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2387             assert mobj is not None
2388
2389         if mobj.group('clip'):
2390             if mobj.group('showname') == 'thedailyshow':
2391                 epTitle = mobj.group('tdstitle')
2392             else:
2393                 epTitle = mobj.group('cntitle')
2394             dlNewest = False
2395         else:
2396             dlNewest = not mobj.group('episode')
2397             if dlNewest:
2398                 epTitle = mobj.group('showname')
2399             else:
2400                 epTitle = mobj.group('episode')
2401
2402         req = compat_urllib_request.Request(url)
2403         self.report_extraction(epTitle)
2404         try:
2405             htmlHandle = compat_urllib_request.urlopen(req)
2406             html = htmlHandle.read()
2407         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2408             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2409             return
2410         if dlNewest:
2411             url = htmlHandle.geturl()
2412             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2413             if mobj is None:
2414                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2415                 return
2416             if mobj.group('episode') == '':
2417                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2418                 return
2419             epTitle = mobj.group('episode')
2420
2421         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
2422
2423         if len(mMovieParams) == 0:
2424             # The Colbert Report embeds the information in a without
2425             # a URL prefix; so extract the alternate reference
2426             # and then add the URL prefix manually.
2427
2428             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
2429             if len(altMovieParams) == 0:
2430                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2431                 return
2432             else:
2433                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2434
2435         uri = mMovieParams[0][1]
2436         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2437         self.report_index_download(epTitle)
2438         try:
2439             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2440         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2441             self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2442             return
2443
2444         results = []
2445
2446         idoc = xml.etree.ElementTree.fromstring(indexXml)
2447         itemEls = idoc.findall('.//item')
2448         for partNum,itemEl in enumerate(itemEls):
2449             mediaId = itemEl.findall('./guid')[0].text
2450             shortMediaId = mediaId.split(':')[-1]
2451             showId = mediaId.split(':')[-2].replace('.com', '')
2452             officialTitle = itemEl.findall('./title')[0].text
2453             officialDate = itemEl.findall('./pubDate')[0].text
2454
2455             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2456                         compat_urllib_parse.urlencode({'uri': mediaId}))
2457             configReq = compat_urllib_request.Request(configUrl)
2458             self.report_config_download(epTitle, shortMediaId)
2459             try:
2460                 configXml = compat_urllib_request.urlopen(configReq).read()
2461             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2462                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2463                 return
2464
2465             cdoc = xml.etree.ElementTree.fromstring(configXml)
2466             turls = []
2467             for rendition in cdoc.findall('.//rendition'):
2468                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2469                 turls.append(finfo)
2470
2471             if len(turls) == 0:
2472                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2473                 continue
2474
2475             if self._downloader.params.get('listformats', None):
2476                 self._print_formats([i[0] for i in turls])
2477                 return
2478
2479             # For now, just pick the highest bitrate
2480             format,rtmp_video_url = turls[-1]
2481
2482             # Get the format arg from the arg stream
2483             req_format = self._downloader.params.get('format', None)
2484
2485             # Select format if we can find one
2486             for f,v in turls:
2487                 if f == req_format:
2488                     format, rtmp_video_url = f, v
2489                     break
2490
2491             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2492             if not m:
2493                 raise ExtractorError(u'Cannot transform RTMP url')
2494             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2495             video_url = base + m.group('finalid')
2496
2497             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2498             info = {
2499                 'id': shortMediaId,
2500                 'url': video_url,
2501                 'uploader': showId,
2502                 'upload_date': officialDate,
2503                 'title': effTitle,
2504                 'ext': 'mp4',
2505                 'format': format,
2506                 'thumbnail': None,
2507                 'description': officialTitle,
2508             }
2509             results.append(info)
2510
2511         return results
2512
2513
2514 class EscapistIE(InfoExtractor):
2515     """Information extractor for The Escapist """
2516
2517     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2518     IE_NAME = u'escapist'
2519
2520     def report_extraction(self, showName):
2521         self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2522
2523     def report_config_download(self, showName):
2524         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2525
2526     def _real_extract(self, url):
2527         mobj = re.match(self._VALID_URL, url)
2528         if mobj is None:
2529             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2530             return
2531         showName = mobj.group('showname')
2532         videoId = mobj.group('episode')
2533
2534         self.report_extraction(showName)
2535         try:
2536             webPage = compat_urllib_request.urlopen(url)
2537             webPageBytes = webPage.read()
2538             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2539             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2540         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2541             self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2542             return
2543
2544         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2545         description = unescapeHTML(descMatch.group(1))
2546         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2547         imgUrl = unescapeHTML(imgMatch.group(1))
2548         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2549         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2550         configUrlMatch = re.search('config=(.*)$', playerUrl)
2551         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2552
2553         self.report_config_download(showName)
2554         try:
2555             configJSON = compat_urllib_request.urlopen(configUrl)
2556             m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2557             configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2558         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2559             self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2560             return
2561
2562         # Technically, it's JavaScript, not JSON
2563         configJSON = configJSON.replace("'", '"')
2564
2565         try:
2566             config = json.loads(configJSON)
2567         except (ValueError,) as err:
2568             self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2569             return
2570
2571         playlist = config['playlist']
2572         videoUrl = playlist[1]['url']
2573
2574         info = {
2575             'id': videoId,
2576             'url': videoUrl,
2577             'uploader': showName,
2578             'upload_date': None,
2579             'title': showName,
2580             'ext': 'flv',
2581             'thumbnail': imgUrl,
2582             'description': description,
2583             'player_url': playerUrl,
2584         }
2585
2586         return [info]
2587
2588 class CollegeHumorIE(InfoExtractor):
2589     """Information extractor for collegehumor.com"""
2590
2591     _WORKING = False
2592     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2593     IE_NAME = u'collegehumor'
2594
2595     def report_manifest(self, video_id):
2596         """Report information extraction."""
2597         self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2598
2599     def report_extraction(self, video_id):
2600         """Report information extraction."""
2601         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2602
2603     def _real_extract(self, url):
2604         mobj = re.match(self._VALID_URL, url)
2605         if mobj is None:
2606             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2607             return
2608         video_id = mobj.group('videoid')
2609
2610         info = {
2611             'id': video_id,
2612             'uploader': None,
2613             'upload_date': None,
2614         }
2615
2616         self.report_extraction(video_id)
2617         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2618         try:
2619             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2620         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2621             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2622             return
2623
2624         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2625         try:
2626             videoNode = mdoc.findall('./video')[0]
2627             info['description'] = videoNode.findall('./description')[0].text
2628             info['title'] = videoNode.findall('./caption')[0].text
2629             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2630             manifest_url = videoNode.findall('./file')[0].text
2631         except IndexError:
2632             self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2633             return
2634
2635         manifest_url += '?hdcore=2.10.3'
2636         self.report_manifest(video_id)
2637         try:
2638             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2639         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2640             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2641             return
2642
2643         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2644         try:
2645             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2646             node_id = media_node.attrib['url']
2647             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2648         except IndexError as err:
2649             self._downloader.trouble(u'\nERROR: Invalid manifest file')
2650             return
2651
2652         url_pr = compat_urllib_parse_urlparse(manifest_url)
2653         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2654
2655         info['url'] = url
2656         info['ext'] = 'f4f'
2657         return [info]
2658
2659
2660 class XVideosIE(InfoExtractor):
2661     """Information extractor for xvideos.com"""
2662
2663     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2664     IE_NAME = u'xvideos'
2665
2666     def report_extraction(self, video_id):
2667         """Report information extraction."""
2668         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2669
2670     def _real_extract(self, url):
2671         mobj = re.match(self._VALID_URL, url)
2672         if mobj is None:
2673             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2674             return
2675         video_id = mobj.group(1)
2676
2677         webpage = self._download_webpage(url, video_id)
2678
2679         self.report_extraction(video_id)
2680
2681
2682         # Extract video URL
2683         mobj = re.search(r'flv_url=(.+?)&', webpage)
2684         if mobj is None:
2685             self._downloader.trouble(u'ERROR: unable to extract video url')
2686             return
2687         video_url = compat_urllib_parse.unquote(mobj.group(1))
2688
2689
2690         # Extract title
2691         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2692         if mobj is None:
2693             self._downloader.trouble(u'ERROR: unable to extract video title')
2694             return
2695         video_title = mobj.group(1)
2696
2697
2698         # Extract video thumbnail
2699         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2700         if mobj is None:
2701             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2702             return
2703         video_thumbnail = mobj.group(0)
2704
2705         info = {
2706             'id': video_id,
2707             'url': video_url,
2708             'uploader': None,
2709             'upload_date': None,
2710             'title': video_title,
2711             'ext': 'flv',
2712             'thumbnail': video_thumbnail,
2713             'description': None,
2714         }
2715
2716         return [info]
2717
2718
2719 class SoundcloudIE(InfoExtractor):
2720     """Information extractor for soundcloud.com
2721        To access the media, the uid of the song and a stream token
2722        must be extracted from the page source and the script must make
2723        a request to media.soundcloud.com/crossdomain.xml. Then
2724        the media can be grabbed by requesting from an url composed
2725        of the stream token and uid
2726      """
2727
2728     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2729     IE_NAME = u'soundcloud'
2730
2731     def __init__(self, downloader=None):
2732         InfoExtractor.__init__(self, downloader)
2733
2734     def report_resolve(self, video_id):
2735         """Report information extraction."""
2736         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2737
2738     def report_extraction(self, video_id):
2739         """Report information extraction."""
2740         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2741
2742     def _real_extract(self, url):
2743         mobj = re.match(self._VALID_URL, url)
2744         if mobj is None:
2745             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2746             return
2747
2748         # extract uploader (which is in the url)
2749         uploader = mobj.group(1)
2750         # extract simple title (uploader + slug of song title)
2751         slug_title =  mobj.group(2)
2752         simple_title = uploader + u'-' + slug_title
2753
2754         self.report_resolve('%s/%s' % (uploader, slug_title))
2755
2756         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2757         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2758         request = compat_urllib_request.Request(resolv_url)
2759         try:
2760             info_json_bytes = compat_urllib_request.urlopen(request).read()
2761             info_json = info_json_bytes.decode('utf-8')
2762         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2763             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2764             return
2765
2766         info = json.loads(info_json)
2767         video_id = info['id']
2768         self.report_extraction('%s/%s' % (uploader, slug_title))
2769
2770         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2771         request = compat_urllib_request.Request(streams_url)
2772         try:
2773             stream_json_bytes = compat_urllib_request.urlopen(request).read()
2774             stream_json = stream_json_bytes.decode('utf-8')
2775         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2776             self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2777             return
2778
2779         streams = json.loads(stream_json)
2780         mediaURL = streams['http_mp3_128_url']
2781
2782         return [{
2783             'id':       info['id'],
2784             'url':      mediaURL,
2785             'uploader': info['user']['username'],
2786             'upload_date':  info['created_at'],
2787             'title':    info['title'],
2788             'ext':      u'mp3',
2789             'description': info['description'],
2790         }]
2791
2792
2793 class InfoQIE(InfoExtractor):
2794     """Information extractor for infoq.com"""
2795     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2796
2797     def report_extraction(self, video_id):
2798         """Report information extraction."""
2799         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2800
2801     def _real_extract(self, url):
2802         mobj = re.match(self._VALID_URL, url)
2803         if mobj is None:
2804             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2805             return
2806
2807         webpage = self._download_webpage(url, video_id=url)
2808         self.report_extraction(url)
2809
2810         # Extract video URL
2811         mobj = re.search(r"jsclassref='([^']*)'", webpage)
2812         if mobj is None:
2813             self._downloader.trouble(u'ERROR: unable to extract video url')
2814             return
2815         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2816         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2817
2818         # Extract title
2819         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2820         if mobj is None:
2821             self._downloader.trouble(u'ERROR: unable to extract video title')
2822             return
2823         video_title = mobj.group(1)
2824
2825         # Extract description
2826         video_description = u'No description available.'
2827         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2828         if mobj is not None:
2829             video_description = mobj.group(1)
2830
2831         video_filename = video_url.split('/')[-1]
2832         video_id, extension = video_filename.split('.')
2833
2834         info = {
2835             'id': video_id,
2836             'url': video_url,
2837             'uploader': None,
2838             'upload_date': None,
2839             'title': video_title,
2840             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2841             'thumbnail': None,
2842             'description': video_description,
2843         }
2844
2845         return [info]
2846
2847 class MixcloudIE(InfoExtractor):
2848     """Information extractor for www.mixcloud.com"""
2849
2850     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2851     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2852     IE_NAME = u'mixcloud'
2853
2854     def __init__(self, downloader=None):
2855         InfoExtractor.__init__(self, downloader)
2856
2857     def report_download_json(self, file_id):
2858         """Report JSON download."""
2859         self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2860
2861     def report_extraction(self, file_id):
2862         """Report information extraction."""
2863         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2864
2865     def get_urls(self, jsonData, fmt, bitrate='best'):
2866         """Get urls from 'audio_formats' section in json"""
2867         file_url = None
2868         try:
2869             bitrate_list = jsonData[fmt]
2870             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2871                 bitrate = max(bitrate_list) # select highest
2872
2873             url_list = jsonData[fmt][bitrate]
2874         except TypeError: # we have no bitrate info.
2875             url_list = jsonData[fmt]
2876         return url_list
2877
2878     def check_urls(self, url_list):
2879         """Returns 1st active url from list"""
2880         for url in url_list:
2881             try:
2882                 compat_urllib_request.urlopen(url)
2883                 return url
2884             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2885                 url = None
2886
2887         return None
2888
2889     def _print_formats(self, formats):
2890         print('Available formats:')
2891         for fmt in formats.keys():
2892             for b in formats[fmt]:
2893                 try:
2894                     ext = formats[fmt][b][0]
2895                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2896                 except TypeError: # we have no bitrate info
2897                     ext = formats[fmt][0]
2898                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2899                     break
2900
2901     def _real_extract(self, url):
2902         mobj = re.match(self._VALID_URL, url)
2903         if mobj is None:
2904             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2905             return
2906         # extract uploader & filename from url
2907         uploader = mobj.group(1).decode('utf-8')
2908         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2909
2910         # construct API request
2911         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2912         # retrieve .json file with links to files
2913         request = compat_urllib_request.Request(file_url)
2914         try:
2915             self.report_download_json(file_url)
2916             jsonData = compat_urllib_request.urlopen(request).read()
2917         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2918             self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2919             return
2920
2921         # parse JSON
2922         json_data = json.loads(jsonData)
2923         player_url = json_data['player_swf_url']
2924         formats = dict(json_data['audio_formats'])
2925
2926         req_format = self._downloader.params.get('format', None)
2927         bitrate = None
2928
2929         if self._downloader.params.get('listformats', None):
2930             self._print_formats(formats)
2931             return
2932
2933         if req_format is None or req_format == 'best':
2934             for format_param in formats.keys():
2935                 url_list = self.get_urls(formats, format_param)
2936                 # check urls
2937                 file_url = self.check_urls(url_list)
2938                 if file_url is not None:
2939                     break # got it!
2940         else:
2941             if req_format not in formats:
2942                 self._downloader.trouble(u'ERROR: format is not available')
2943                 return
2944
2945             url_list = self.get_urls(formats, req_format)
2946             file_url = self.check_urls(url_list)
2947             format_param = req_format
2948
2949         return [{
2950             'id': file_id.decode('utf-8'),
2951             'url': file_url.decode('utf-8'),
2952             'uploader': uploader.decode('utf-8'),
2953             'upload_date': None,
2954             'title': json_data['name'],
2955             'ext': file_url.split('.')[-1].decode('utf-8'),
2956             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2957             'thumbnail': json_data['thumbnail_url'],
2958             'description': json_data['description'],
2959             'player_url': player_url.decode('utf-8'),
2960         }]
2961
2962 class StanfordOpenClassroomIE(InfoExtractor):
2963     """Information extractor for Stanford's Open ClassRoom"""
2964
2965     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2966     IE_NAME = u'stanfordoc'
2967
2968     def report_download_webpage(self, objid):
2969         """Report information extraction."""
2970         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2971
2972     def report_extraction(self, video_id):
2973         """Report information extraction."""
2974         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2975
2976     def _real_extract(self, url):
2977         mobj = re.match(self._VALID_URL, url)
2978         if mobj is None:
2979             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2980             return
2981
2982         if mobj.group('course') and mobj.group('video'): # A specific video
2983             course = mobj.group('course')
2984             video = mobj.group('video')
2985             info = {
2986                 'id': course + '_' + video,
2987                 'uploader': None,
2988                 'upload_date': None,
2989             }
2990
2991             self.report_extraction(info['id'])
2992             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2993             xmlUrl = baseUrl + video + '.xml'
2994             try:
2995                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2996             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2997                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2998                 return
2999             mdoc = xml.etree.ElementTree.fromstring(metaXml)
3000             try:
3001                 info['title'] = mdoc.findall('./title')[0].text
3002                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3003             except IndexError:
3004                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3005                 return
3006             info['ext'] = info['url'].rpartition('.')[2]
3007             return [info]
3008         elif mobj.group('course'): # A course page
3009             course = mobj.group('course')
3010             info = {
3011                 'id': course,
3012                 'type': 'playlist',
3013                 'uploader': None,
3014                 'upload_date': None,
3015             }
3016
3017             self.report_download_webpage(info['id'])
3018             try:
3019                 coursepage = compat_urllib_request.urlopen(url).read()
3020             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3021                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3022                 return
3023
3024             m = re.search('<h1>([^<]+)</h1>', coursepage)
3025             if m:
3026                 info['title'] = unescapeHTML(m.group(1))
3027             else:
3028                 info['title'] = info['id']
3029
3030             m = re.search('<description>([^<]+)</description>', coursepage)
3031             if m:
3032                 info['description'] = unescapeHTML(m.group(1))
3033
3034             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3035             info['list'] = [
3036                 {
3037                     'type': 'reference',
3038                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3039                 }
3040                     for vpage in links]
3041             results = []
3042             for entry in info['list']:
3043                 assert entry['type'] == 'reference'
3044                 results += self.extract(entry['url'])
3045             return results
3046
3047         else: # Root page
3048             info = {
3049                 'id': 'Stanford OpenClassroom',
3050                 'type': 'playlist',
3051                 'uploader': None,
3052                 'upload_date': None,
3053             }
3054
3055             self.report_download_webpage(info['id'])
3056             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3057             try:
3058                 rootpage = compat_urllib_request.urlopen(rootURL).read()
3059             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3060                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3061                 return
3062
3063             info['title'] = info['id']
3064
3065             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3066             info['list'] = [
3067                 {
3068                     'type': 'reference',
3069                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3070                 }
3071                     for cpage in links]
3072
3073             results = []
3074             for entry in info['list']:
3075                 assert entry['type'] == 'reference'
3076                 results += self.extract(entry['url'])
3077             return results
3078
3079 class MTVIE(InfoExtractor):
3080     """Information extractor for MTV.com"""
3081
3082     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3083     IE_NAME = u'mtv'
3084
3085     def report_extraction(self, video_id):
3086         """Report information extraction."""
3087         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3088
3089     def _real_extract(self, url):
3090         mobj = re.match(self._VALID_URL, url)
3091         if mobj is None:
3092             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3093             return
3094         if not mobj.group('proto'):
3095             url = 'http://' + url
3096         video_id = mobj.group('videoid')
3097
3098         webpage = self._download_webpage(url, video_id)
3099
3100         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3101         if mobj is None:
3102             self._downloader.trouble(u'ERROR: unable to extract song name')
3103             return
3104         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3105         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3106         if mobj is None:
3107             self._downloader.trouble(u'ERROR: unable to extract performer')
3108             return
3109         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3110         video_title = performer + ' - ' + song_name
3111
3112         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3113         if mobj is None:
3114             self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3115             return
3116         mtvn_uri = mobj.group(1)
3117
3118         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3119         if mobj is None:
3120             self._downloader.trouble(u'ERROR: unable to extract content id')
3121             return
3122         content_id = mobj.group(1)
3123
3124         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3125         self.report_extraction(video_id)
3126         request = compat_urllib_request.Request(videogen_url)
3127         try:
3128             metadataXml = compat_urllib_request.urlopen(request).read()
3129         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3130             self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3131             return
3132
3133         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3134         renditions = mdoc.findall('.//rendition')
3135
3136         # For now, always pick the highest quality.
3137         rendition = renditions[-1]
3138
3139         try:
3140             _,_,ext = rendition.attrib['type'].partition('/')
3141             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3142             video_url = rendition.find('./src').text
3143         except KeyError:
3144             self._downloader.trouble('Invalid rendition field.')
3145             return
3146
3147         info = {
3148             'id': video_id,
3149             'url': video_url,
3150             'uploader': performer,
3151             'upload_date': None,
3152             'title': video_title,
3153             'ext': ext,
3154             'format': format,
3155         }
3156
3157         return [info]
3158
3159
3160 class YoukuIE(InfoExtractor):
3161     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3162
3163     def report_download_webpage(self, file_id):
3164         """Report webpage download."""
3165         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3166
3167     def report_extraction(self, file_id):
3168         """Report information extraction."""
3169         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3170
3171     def _gen_sid(self):
3172         nowTime = int(time.time() * 1000)
3173         random1 = random.randint(1000,1998)
3174         random2 = random.randint(1000,9999)
3175
3176         return "%d%d%d" %(nowTime,random1,random2)
3177
3178     def _get_file_ID_mix_string(self, seed):
3179         mixed = []
3180         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3181         seed = float(seed)
3182         for i in range(len(source)):
3183             seed  =  (seed * 211 + 30031 ) % 65536
3184             index  =  math.floor(seed / 65536 * len(source) )
3185             mixed.append(source[int(index)])
3186             source.remove(source[int(index)])
3187         #return ''.join(mixed)
3188         return mixed
3189
3190     def _get_file_id(self, fileId, seed):
3191         mixed = self._get_file_ID_mix_string(seed)
3192         ids = fileId.split('*')
3193         realId = []
3194         for ch in ids:
3195             if ch:
3196                 realId.append(mixed[int(ch)])
3197         return ''.join(realId)
3198
3199     def _real_extract(self, url):
3200         mobj = re.match(self._VALID_URL, url)
3201         if mobj is None:
3202             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3203             return
3204         video_id = mobj.group('ID')
3205
3206         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3207
3208         request = compat_urllib_request.Request(info_url, None, std_headers)
3209         try:
3210             self.report_download_webpage(video_id)
3211             jsondata = compat_urllib_request.urlopen(request).read()
3212         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3213             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3214             return
3215
3216         self.report_extraction(video_id)
3217         try:
3218             jsonstr = jsondata.decode('utf-8')
3219             config = json.loads(jsonstr)
3220
3221             video_title =  config['data'][0]['title']
3222             seed = config['data'][0]['seed']
3223
3224             format = self._downloader.params.get('format', None)
3225             supported_format = list(config['data'][0]['streamfileids'].keys())
3226
3227             if format is None or format == 'best':
3228                 if 'hd2' in supported_format:
3229                     format = 'hd2'
3230                 else:
3231                     format = 'flv'
3232                 ext = u'flv'
3233             elif format == 'worst':
3234                 format = 'mp4'
3235                 ext = u'mp4'
3236             else:
3237                 format = 'flv'
3238                 ext = u'flv'
3239
3240
3241             fileid = config['data'][0]['streamfileids'][format]
3242             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3243         except (UnicodeDecodeError, ValueError, KeyError):
3244             self._downloader.trouble(u'ERROR: unable to extract info section')
3245             return
3246
3247         files_info=[]
3248         sid = self._gen_sid()
3249         fileid = self._get_file_id(fileid, seed)
3250
3251         #column 8,9 of fileid represent the segment number
3252         #fileid[7:9] should be changed
3253         for index, key in enumerate(keys):
3254
3255             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3256             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3257
3258             info = {
3259                 'id': '%s_part%02d' % (video_id, index),
3260                 'url': download_url,
3261                 'uploader': None,
3262                 'upload_date': None,
3263                 'title': video_title,
3264                 'ext': ext,
3265             }
3266             files_info.append(info)
3267
3268         return files_info
3269
3270
3271 class XNXXIE(InfoExtractor):
3272     """Information extractor for xnxx.com"""
3273
3274     _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3275     IE_NAME = u'xnxx'
3276     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3277     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3278     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3279
3280     def report_webpage(self, video_id):
3281         """Report information extraction"""
3282         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3283
3284     def report_extraction(self, video_id):
3285         """Report information extraction"""
3286         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3287
3288     def _real_extract(self, url):
3289         mobj = re.match(self._VALID_URL, url)
3290         if mobj is None:
3291             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3292             return
3293         video_id = mobj.group(1)
3294
3295         self.report_webpage(video_id)
3296
3297         # Get webpage content
3298         try:
3299             webpage_bytes = compat_urllib_request.urlopen(url).read()
3300             webpage = webpage_bytes.decode('utf-8')
3301         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3302             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3303             return
3304
3305         result = re.search(self.VIDEO_URL_RE, webpage)
3306         if result is None:
3307             self._downloader.trouble(u'ERROR: unable to extract video url')
3308             return
3309         video_url = compat_urllib_parse.unquote(result.group(1))
3310
3311         result = re.search(self.VIDEO_TITLE_RE, webpage)
3312         if result is None:
3313             self._downloader.trouble(u'ERROR: unable to extract video title')
3314             return
3315         video_title = result.group(1)
3316
3317         result = re.search(self.VIDEO_THUMB_RE, webpage)
3318         if result is None:
3319             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3320             return
3321         video_thumbnail = result.group(1)
3322
3323         return [{
3324             'id': video_id,
3325             'url': video_url,
3326             'uploader': None,
3327             'upload_date': None,
3328             'title': video_title,
3329             'ext': 'flv',
3330             'thumbnail': video_thumbnail,
3331             'description': None,
3332         }]
3333
3334
3335 class GooglePlusIE(InfoExtractor):
3336     """Information extractor for plus.google.com."""
3337
3338     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3339     IE_NAME = u'plus.google'
3340
3341     def __init__(self, downloader=None):
3342         InfoExtractor.__init__(self, downloader)
3343
3344     def report_extract_entry(self, url):
3345         """Report downloading extry"""
3346         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3347
3348     def report_date(self, upload_date):
3349         """Report downloading extry"""
3350         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3351
3352     def report_uploader(self, uploader):
3353         """Report downloading extry"""
3354         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3355
3356     def report_title(self, video_title):
3357         """Report downloading extry"""
3358         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3359
3360     def report_extract_vid_page(self, video_page):
3361         """Report information extraction."""
3362         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3363
3364     def _real_extract(self, url):
3365         # Extract id from URL
3366         mobj = re.match(self._VALID_URL, url)
3367         if mobj is None:
3368             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3369             return
3370
3371         post_url = mobj.group(0)
3372         video_id = mobj.group(1)
3373
3374         video_extension = 'flv'
3375
3376         # Step 1, Retrieve post webpage to extract further information
3377         self.report_extract_entry(post_url)
3378         request = compat_urllib_request.Request(post_url)
3379         try:
3380             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3381         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3382             self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3383             return
3384
3385         # Extract update date
3386         upload_date = None
3387         pattern = 'title="Timestamp">(.*?)</a>'
3388         mobj = re.search(pattern, webpage)
3389         if mobj:
3390             upload_date = mobj.group(1)
3391             # Convert timestring to a format suitable for filename
3392             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3393             upload_date = upload_date.strftime('%Y%m%d')
3394         self.report_date(upload_date)
3395
3396         # Extract uploader
3397         uploader = None
3398         pattern = r'rel\="author".*?>(.*?)</a>'
3399         mobj = re.search(pattern, webpage)
3400         if mobj:
3401             uploader = mobj.group(1)
3402         self.report_uploader(uploader)
3403
3404         # Extract title
3405         # Get the first line for title
3406         video_title = u'NA'
3407         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3408         mobj = re.search(pattern, webpage)
3409         if mobj:
3410             video_title = mobj.group(1)
3411         self.report_title(video_title)
3412
3413         # Step 2, Stimulate clicking the image box to launch video
3414         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3415         mobj = re.search(pattern, webpage)
3416         if mobj is None:
3417             self._downloader.trouble(u'ERROR: unable to extract video page URL')
3418
3419         video_page = mobj.group(1)
3420         request = compat_urllib_request.Request(video_page)
3421         try:
3422             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3423         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3424             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3425             return
3426         self.report_extract_vid_page(video_page)
3427
3428
3429         # Extract video links on video page
3430         """Extract video links of all sizes"""
3431         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3432         mobj = re.findall(pattern, webpage)
3433         if len(mobj) == 0:
3434             self._downloader.trouble(u'ERROR: unable to extract video links')
3435
3436         # Sort in resolution
3437         links = sorted(mobj)
3438
3439         # Choose the lowest of the sort, i.e. highest resolution
3440         video_url = links[-1]
3441         # Only get the url. The resolution part in the tuple has no use anymore
3442         video_url = video_url[-1]
3443         # Treat escaped \u0026 style hex
3444         try:
3445             video_url = video_url.decode("unicode_escape")
3446         except AttributeError: # Python 3
3447             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3448
3449
3450         return [{
3451             'id':       video_id,
3452             'url':      video_url,
3453             'uploader': uploader,
3454             'upload_date':  upload_date,
3455             'title':    video_title,
3456             'ext':      video_extension,
3457         }]
3458
3459 class NBAIE(InfoExtractor):
3460     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3461     IE_NAME = u'nba'
3462
3463     def _real_extract(self, url):
3464         mobj = re.match(self._VALID_URL, url)
3465         if mobj is None:
3466             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3467             return
3468
3469         video_id = mobj.group(1)
3470         if video_id.endswith('/index.html'):
3471             video_id = video_id[:-len('/index.html')]
3472
3473         webpage = self._download_webpage(url, video_id)
3474
3475         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3476         def _findProp(rexp, default=None):
3477             m = re.search(rexp, webpage)
3478             if m:
3479                 return unescapeHTML(m.group(1))
3480             else:
3481                 return default
3482
3483         shortened_video_id = video_id.rpartition('/')[2]
3484         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3485         info = {
3486             'id': shortened_video_id,
3487             'url': video_url,
3488             'ext': 'mp4',
3489             'title': title,
3490             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3491             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3492         }
3493         return [info]
3494
3495 class JustinTVIE(InfoExtractor):
3496     """Information extractor for justin.tv and twitch.tv"""
3497     # TODO: One broadcast may be split into multiple videos. The key
3498     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3499     # starts at 1 and increases. Can we treat all parts as one video?
3500
3501     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3502         ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3503     _JUSTIN_PAGE_LIMIT = 100
3504     IE_NAME = u'justin.tv'
3505
3506     def report_extraction(self, file_id):
3507         """Report information extraction."""
3508         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3509
3510     def report_download_page(self, channel, offset):
3511         """Report attempt to download a single page of videos."""
3512         self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3513                 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3514
3515     # Return count of items, list of *valid* items
3516     def _parse_page(self, url):
3517         try:
3518             urlh = compat_urllib_request.urlopen(url)
3519             webpage_bytes = urlh.read()
3520             webpage = webpage_bytes.decode('utf-8', 'ignore')
3521         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3522             self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3523             return
3524
3525         response = json.loads(webpage)
3526         info = []
3527         for clip in response:
3528             video_url = clip['video_file_url']
3529             if video_url:
3530                 video_extension = os.path.splitext(video_url)[1][1:]
3531                 video_date = re.sub('-', '', clip['created_on'][:10])
3532                 info.append({
3533                     'id': clip['id'],
3534                     'url': video_url,
3535                     'title': clip['title'],
3536                     'uploader': clip.get('user_id', clip.get('channel_id')),
3537                     'upload_date': video_date,
3538                     'ext': video_extension,
3539                 })
3540         return (len(response), info)
3541
3542     def _real_extract(self, url):
3543         mobj = re.match(self._VALID_URL, url)
3544         if mobj is None:
3545             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3546             return
3547
3548         api = 'http://api.justin.tv'
3549         video_id = mobj.group(mobj.lastindex)
3550         paged = False
3551         if mobj.lastindex == 1:
3552             paged = True
3553             api += '/channel/archives/%s.json'
3554         else:
3555             api += '/clip/show/%s.json'
3556         api = api % (video_id,)
3557
3558         self.report_extraction(video_id)
3559
3560         info = []
3561         offset = 0
3562         limit = self._JUSTIN_PAGE_LIMIT
3563         while True:
3564             if paged:
3565                 self.report_download_page(video_id, offset)
3566             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3567             page_count, page_info = self._parse_page(page_url)
3568             info.extend(page_info)
3569             if not paged or page_count != limit:
3570                 break
3571             offset += limit
3572         return info
3573
3574 class FunnyOrDieIE(InfoExtractor):
3575     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3576
3577     def _real_extract(self, url):
3578         mobj = re.match(self._VALID_URL, url)
3579         if mobj is None:
3580             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3581             return
3582
3583         video_id = mobj.group('id')
3584         webpage = self._download_webpage(url, video_id)
3585
3586         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3587         if not m:
3588             self._downloader.trouble(u'ERROR: unable to find video information')
3589         video_url = unescapeHTML(m.group('url'))
3590
3591         m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3592         if not m:
3593             self._downloader.trouble(u'Cannot find video title')
3594         title = unescapeHTML(m.group('title'))
3595
3596         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3597         if m:
3598             desc = unescapeHTML(m.group('desc'))
3599         else:
3600             desc = None
3601
3602         info = {
3603             'id': video_id,
3604             'url': video_url,
3605             'ext': 'mp4',
3606             'title': title,
3607             'description': desc,
3608         }
3609         return [info]
3610
3611 class TweetReelIE(InfoExtractor):
3612     _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3613
3614     def _real_extract(self, url):
3615         mobj = re.match(self._VALID_URL, url)
3616         if mobj is None:
3617             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3618             return
3619
3620         video_id = mobj.group('id')
3621         webpage = self._download_webpage(url, video_id)
3622
3623         m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3624         if not m:
3625             self._downloader.trouble(u'ERROR: Cannot find status ID')
3626         status_id = m.group(1)
3627
3628         m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3629         if not m:
3630             self._downloader.trouble(u'WARNING: Cannot find description')
3631         desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3632
3633         m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3634         if not m:
3635             self._downloader.trouble(u'ERROR: Cannot find uploader')
3636         uploader = unescapeHTML(m.group('uploader'))
3637         uploader_id = unescapeHTML(m.group('uploader_id'))
3638
3639         m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3640         if not m:
3641             self._downloader.trouble(u'ERROR: Cannot find upload date')
3642         upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3643
3644         title = desc
3645         video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3646
3647         info = {
3648             'id': video_id,
3649             'url': video_url,
3650             'ext': 'mov',
3651             'title': title,
3652             'description': desc,
3653             'uploader': uploader,
3654             'uploader_id': uploader_id,
3655             'internal_id': status_id,
3656             'upload_date': upload_date
3657         }
3658         return [info]
3659
3660 class SteamIE(InfoExtractor):
3661     _VALID_URL = r"""http://store.steampowered.com/
3662                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3663                 (?P<gameID>\d+)/?
3664                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3665                 """
3666
3667     def suitable(self, url):
3668         """Receives a URL and returns True if suitable for this IE."""
3669         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3670
3671     def _real_extract(self, url):
3672         m = re.match(self._VALID_URL, url, re.VERBOSE)
3673         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3674         gameID = m.group('gameID')
3675         videourl = 'http://store.steampowered.com/video/%s/' % gameID
3676         webpage = self._download_webpage(videourl, gameID)
3677         mweb = re.finditer(urlRE, webpage)
3678         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3679         titles = re.finditer(namesRE, webpage)
3680         videos = []
3681         for vid,vtitle in zip(mweb,titles):
3682             video_id = vid.group('videoID')
3683             title = vtitle.group('videoName')
3684             video_url = vid.group('videoURL')
3685             if not video_url:
3686                 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3687             info = {
3688                 'id':video_id,
3689                 'url':video_url,
3690                 'ext': 'flv',
3691                 'title': unescapeHTML(title)
3692                   }
3693             videos.append(info)
3694         return videos
3695
3696 class UstreamIE(InfoExtractor):
3697     _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
3698     IE_NAME = u'ustream'
3699
3700     def _real_extract(self, url):
3701         m = re.match(self._VALID_URL, url)
3702         video_id = m.group('videoID')
3703         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3704         webpage = self._download_webpage(url, video_id)
3705         m = re.search(r'data-title="(?P<title>.+)"',webpage)
3706         title = m.group('title')
3707         m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3708         uploader = m.group('uploader')
3709         info = {
3710                 'id':video_id,
3711                 'url':video_url,
3712                 'ext': 'flv',
3713                 'title': title,
3714                 'uploader': uploader
3715                   }
3716         return [info]
3717
3718
3719 def gen_extractors():
3720     """ Return a list of an instance of every supported extractor.
3721     The order does matter; the first extractor matched is the one handling the URL.
3722     """
3723     return [
3724         YoutubePlaylistIE(),
3725         YoutubeChannelIE(),
3726         YoutubeUserIE(),
3727         YoutubeSearchIE(),
3728         YoutubeIE(),
3729         MetacafeIE(),
3730         DailymotionIE(),
3731         GoogleSearchIE(),
3732         PhotobucketIE(),
3733         YahooIE(),
3734         YahooSearchIE(),
3735         DepositFilesIE(),
3736         FacebookIE(),
3737         BlipTVUserIE(),
3738         BlipTVIE(),
3739         VimeoIE(),
3740         MyVideoIE(),
3741         ComedyCentralIE(),
3742         EscapistIE(),
3743         CollegeHumorIE(),
3744         XVideosIE(),
3745         SoundcloudIE(),
3746         InfoQIE(),
3747         MixcloudIE(),
3748         StanfordOpenClassroomIE(),
3749         MTVIE(),
3750         YoukuIE(),
3751         XNXXIE(),
3752         GooglePlusIE(),
3753         ArteTvIE(),
3754         NBAIE(),
3755         JustinTVIE(),
3756         FunnyOrDieIE(),
3757         TweetReelIE(),
3758         SteamIE(),
3759         UstreamIE(),
3760         GenericIE()
3761     ]
3762
3763