youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import netrc
   9 import os
  10 import re
  11 import socket
  12 import time
  13 import email.utils
  14 import xml.etree.ElementTree
  15 import random
  16 import math
  17
  18 from .utils import *
  19
  20
  21 class InfoExtractor(object):
  22     """Information Extractor class.
  23
  24     Information extractors are the classes that, given a URL, extract
  25     information about the video (or videos) the URL refers to. This
  26     information includes the real video URL, the video title, author and
  27     others. The information is stored in a dictionary which is then
  28     passed to the FileDownloader. The FileDownloader processes this
  29     information possibly downloading the video to the file system, among
  30     other possible outcomes.
  31
  32     The dictionaries must include the following fields:
  33
  34     id:             Video identifier.
  35     url:            Final video URL.
  36     title:          Video title, unescaped.
  37     ext:            Video filename extension.
  38     uploader:       Full name of the video uploader.
  39     upload_date:    Video upload date (YYYYMMDD).
  40
  41     The following fields are optional:
  42
  43     format:         The video format, defaults to ext (used for --get-format)
  44     thumbnail:      Full URL to a video thumbnail image.
  45     description:    One-line video description.
  46     uploader_id:    Nickname or id of the video uploader.
  47     player_url:     SWF Player URL (used for rtmpdump).
  48     subtitles:      The .srt file contents.
  49     urlhandle:      [internal] The urlHandle to be used to download the file,
  50                     like returned by urllib.request.urlopen
  51
  52     The fields should all be Unicode strings.
  53
  54     Subclasses of this one should re-define the _real_initialize() and
  55     _real_extract() methods and define a _VALID_URL regexp.
  56     Probably, they should also be added to the list of extractors.
  57
  58     _real_extract() must return a *list* of information dictionaries as
  59     described above.
  60
  61     Finally, the _WORKING attribute should be set to False for broken IEs
  62     in order to warn the users and skip the tests.
  63     """
  64
  65     _ready = False
  66     _downloader = None
  67     _WORKING = True
  68
  69     def __init__(self, downloader=None):
  70         """Constructor. Receives an optional downloader."""
  71         self._ready = False
  72         self.set_downloader(downloader)
  73
  74     def suitable(self, url):
  75         """Receives a URL and returns True if suitable for this IE."""
  76         return re.match(self._VALID_URL, url) is not None
  77
  78     def working(self):
  79         """Getter method for _WORKING."""
  80         return self._WORKING
  81
  82     def initialize(self):
  83         """Initializes an instance (authentication, etc)."""
  84         if not self._ready:
  85             self._real_initialize()
  86             self._ready = True
  87
  88     def extract(self, url):
  89         """Extracts URL information and returns it in list of dicts."""
  90         self.initialize()
  91         return self._real_extract(url)
  92
  93     def set_downloader(self, downloader):
  94         """Sets the downloader for this IE."""
  95         self._downloader = downloader
  96
  97     def _real_initialize(self):
  98         """Real initialization process. Redefine in subclasses."""
  99         pass
 100
 101     def _real_extract(self, url):
 102         """Real extraction process. Redefine in subclasses."""
 103         pass
 104
 105     @property
 106     def IE_NAME(self):
 107         return type(self).__name__[:-2]
 108
 109     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 110         if note is None:
 111             note = u'Downloading video webpage'
 112         self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
 113         try:
 114             urlh = compat_urllib_request.urlopen(url_or_request)
 115             webpage_bytes = urlh.read()
 116             return webpage_bytes.decode('utf-8', 'replace')
 117         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 118             if errnote is None:
 119                 errnote = u'Unable to download webpage'
 120             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 121
 122
 123 class YoutubeIE(InfoExtractor):
 124     """Information extractor for youtube.com."""
 125
 126     _VALID_URL = r"""^
 127                      (
 128                          (?:https?://)?                                       # http(s):// (optional)
 129                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 130                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 131                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 132                          (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 133                          (?:                                                  # the various things that can precede the ID:
 134                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 135                              |(?:                                             # or the v= param in all its forms
 136                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 137                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 138                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 139                                  v=
 140                              )
 141                          )?                                                   # optional -> youtube.com/xxxx is OK
 142                      )?                                                       # all until now is optional -> you can pass the naked ID
 143                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 144                      (?(1).+)?                                                # if we found the ID, everything can follow
 145                      $"""
 146     _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 147     _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 148     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 149     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 150     _NETRC_MACHINE = 'youtube'
 151     # Listed in order of quality
 152     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 153     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 154     _video_extensions = {
 155         '13': '3gp',
 156         '17': 'mp4',
 157         '18': 'mp4',
 158         '22': 'mp4',
 159         '37': 'mp4',
 160         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 161         '43': 'webm',
 162         '44': 'webm',
 163         '45': 'webm',
 164         '46': 'webm',
 165     }
 166     _video_dimensions = {
 167         '5': '240x400',
 168         '6': '???',
 169         '13': '???',
 170         '17': '144x176',
 171         '18': '360x640',
 172         '22': '720x1280',
 173         '34': '360x640',
 174         '35': '480x854',
 175         '37': '1080x1920',
 176         '38': '3072x4096',
 177         '43': '360x640',
 178         '44': '480x854',
 179         '45': '720x1280',
 180         '46': '1080x1920',
 181     }
 182     IE_NAME = u'youtube'
 183
 184     def suitable(self, url):
 185         """Receives a URL and returns True if suitable for this IE."""
 186         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 187
 188     def report_lang(self):
 189         """Report attempt to set language."""
 190         self._downloader.to_screen(u'[youtube] Setting language')
 191
 192     def report_login(self):
 193         """Report attempt to log in."""
 194         self._downloader.to_screen(u'[youtube] Logging in')
 195
 196     def report_age_confirmation(self):
 197         """Report attempt to confirm age."""
 198         self._downloader.to_screen(u'[youtube] Confirming age')
 199
 200     def report_video_webpage_download(self, video_id):
 201         """Report attempt to download video webpage."""
 202         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 203
 204     def report_video_info_webpage_download(self, video_id):
 205         """Report attempt to download video info webpage."""
 206         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 207
 208     def report_video_subtitles_download(self, video_id):
 209         """Report attempt to download video info webpage."""
 210         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 211
 212     def report_information_extraction(self, video_id):
 213         """Report attempt to extract video information."""
 214         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 215
 216     def report_unavailable_format(self, video_id, format):
 217         """Report extracted video URL."""
 218         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 219
 220     def report_rtmp_download(self):
 221         """Indicate the download will use the RTMP protocol."""
 222         self._downloader.to_screen(u'[youtube] RTMP download detected')
 223
 224     def _closed_captions_xml_to_srt(self, xml_string):
 225         srt = ''
 226         texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 227         # TODO parse xml instead of regex
 228         for n, (start, dur_tag, dur, caption) in enumerate(texts):
 229             if not dur: dur = '4'
 230             start = float(start)
 231             end = start + float(dur)
 232             start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 233             end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 234             caption = unescapeHTML(caption)
 235             caption = unescapeHTML(caption) # double cycle, intentional
 236             srt += str(n+1) + '\n'
 237             srt += start + ' --> ' + end + '\n'
 238             srt += caption + '\n\n'
 239         return srt
 240
 241     def _extract_subtitles(self, video_id):
 242         self.report_video_subtitles_download(video_id)
 243         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 244         try:
 245             srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 246         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 247             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 248         srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 249         srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 250         if not srt_lang_list:
 251             return (u'WARNING: video has no closed captions', None)
 252         if self._downloader.params.get('subtitleslang', False):
 253             srt_lang = self._downloader.params.get('subtitleslang')
 254         elif 'en' in srt_lang_list:
 255             srt_lang = 'en'
 256         else:
 257             srt_lang = list(srt_lang_list.keys())[0]
 258         if not srt_lang in srt_lang_list:
 259             return (u'WARNING: no closed captions found in the specified language', None)
 260         request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
 261         try:
 262             srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
 263         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 264             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 265         if not srt_xml:
 266             return (u'WARNING: unable to download video subtitles', None)
 267         return (None, self._closed_captions_xml_to_srt(srt_xml))
 268
 269     def _print_formats(self, formats):
 270         print('Available formats:')
 271         for x in formats:
 272             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 273
 274     def _real_initialize(self):
 275         if self._downloader is None:
 276             return
 277
 278         username = None
 279         password = None
 280         downloader_params = self._downloader.params
 281
 282         # Attempt to use provided username and password or .netrc data
 283         if downloader_params.get('username', None) is not None:
 284             username = downloader_params['username']
 285             password = downloader_params['password']
 286         elif downloader_params.get('usenetrc', False):
 287             try:
 288                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 289                 if info is not None:
 290                     username = info[0]
 291                     password = info[2]
 292                 else:
 293                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 294             except (IOError, netrc.NetrcParseError) as err:
 295                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
 296                 return
 297
 298         # Set language
 299         request = compat_urllib_request.Request(self._LANG_URL)
 300         try:
 301             self.report_lang()
 302             compat_urllib_request.urlopen(request).read()
 303         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 304             self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
 305             return
 306
 307         # No authentication to be performed
 308         if username is None:
 309             return
 310
 311         # Log in
 312         login_form = {
 313                 'current_form': 'loginForm',
 314                 'next':     '/',
 315                 'action_login': 'Log In',
 316                 'username': username,
 317                 'password': password,
 318                 }
 319         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
 320         try:
 321             self.report_login()
 322             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 323             if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 324                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 325                 return
 326         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 327             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
 328             return
 329
 330         # Confirm age
 331         age_form = {
 332                 'next_url':     '/',
 333                 'action_confirm':   'Confirm',
 334                 }
 335         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 336         try:
 337             self.report_age_confirmation()
 338             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 339         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 340             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 341             return
 342
 343     def _extract_id(self, url):
 344         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 345         if mobj is None:
 346             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 347             return
 348         video_id = mobj.group(2)
 349         return video_id
 350
 351     def _real_extract(self, url):
 352         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 353         mobj = re.search(self._NEXT_URL_RE, url)
 354         if mobj:
 355             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 356         video_id = self._extract_id(url)
 357
 358         # Get video webpage
 359         self.report_video_webpage_download(video_id)
 360         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 361         request = compat_urllib_request.Request(url)
 362         try:
 363             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 364         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 365             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
 366             return
 367
 368         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 369
 370         # Attempt to extract SWF player URL
 371         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 372         if mobj is not None:
 373             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 374         else:
 375             player_url = None
 376
 377         # Get video info
 378         self.report_video_info_webpage_download(video_id)
 379         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 380             video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 381                     % (video_id, el_type))
 382             request = compat_urllib_request.Request(video_info_url)
 383             try:
 384                 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
 385                 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
 386                 video_info = compat_parse_qs(video_info_webpage)
 387                 if 'token' in video_info:
 388                     break
 389             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 390                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
 391                 return
 392         if 'token' not in video_info:
 393             if 'reason' in video_info:
 394                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
 395             else:
 396                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 397             return
 398
 399         # Check for "rental" videos
 400         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 401             self._downloader.trouble(u'ERROR: "rental" videos not supported')
 402             return
 403
 404         # Start extracting information
 405         self.report_information_extraction(video_id)
 406
 407         # uploader
 408         if 'author' not in video_info:
 409             self._downloader.trouble(u'ERROR: unable to extract uploader name')
 410             return
 411         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 412
 413         # uploader_id
 414         video_uploader_id = None
 415         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 416         if mobj is not None:
 417             video_uploader_id = mobj.group(1)
 418         else:
 419             self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 420
 421         # title
 422         if 'title' not in video_info:
 423             self._downloader.trouble(u'ERROR: unable to extract video title')
 424             return
 425         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 426
 427         # thumbnail image
 428         if 'thumbnail_url' not in video_info:
 429             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 430             video_thumbnail = ''
 431         else:   # don't panic if we can't find it
 432             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 433
 434         # upload date
 435         upload_date = None
 436         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 437         if mobj is not None:
 438             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 439             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 440             for expression in format_expressions:
 441                 try:
 442                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 443                 except:
 444                     pass
 445
 446         # description
 447         video_description = get_element_by_id("eow-description", video_webpage)
 448         if video_description:
 449             video_description = clean_html(video_description)
 450         else:
 451             video_description = ''
 452
 453         # closed captions
 454         video_subtitles = None
 455         if self._downloader.params.get('writesubtitles', False):
 456             (srt_error, video_subtitles) = self._extract_subtitles(video_id)
 457             if srt_error:
 458                 self._downloader.trouble(srt_error)
 459
 460         if 'length_seconds' not in video_info:
 461             self._downloader.trouble(u'WARNING: unable to extract video duration')
 462             video_duration = ''
 463         else:
 464             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 465
 466         # token
 467         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 468
 469         # Decide which formats to download
 470         req_format = self._downloader.params.get('format', None)
 471
 472         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 473             self.report_rtmp_download()
 474             video_url_list = [(None, video_info['conn'][0])]
 475         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 476             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 477             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 478             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 479             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 480
 481             format_limit = self._downloader.params.get('format_limit', None)
 482             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 483             if format_limit is not None and format_limit in available_formats:
 484                 format_list = available_formats[available_formats.index(format_limit):]
 485             else:
 486                 format_list = available_formats
 487             existing_formats = [x for x in format_list if x in url_map]
 488             if len(existing_formats) == 0:
 489                 self._downloader.trouble(u'ERROR: no known formats available for video')
 490                 return
 491             if self._downloader.params.get('listformats', None):
 492                 self._print_formats(existing_formats)
 493                 return
 494             if req_format is None or req_format == 'best':
 495                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 496             elif req_format == 'worst':
 497                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 498             elif req_format in ('-1', 'all'):
 499                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 500             else:
 501                 # Specific formats. We pick the first in a slash-delimeted sequence.
 502                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 503                 req_formats = req_format.split('/')
 504                 video_url_list = None
 505                 for rf in req_formats:
 506                     if rf in url_map:
 507                         video_url_list = [(rf, url_map[rf])]
 508                         break
 509                 if video_url_list is None:
 510                     self._downloader.trouble(u'ERROR: requested format not available')
 511                     return
 512         else:
 513             self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 514             return
 515
 516         results = []
 517         for format_param, video_real_url in video_url_list:
 518             # Extension
 519             video_extension = self._video_extensions.get(format_param, 'flv')
 520
 521             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 522                                               self._video_dimensions.get(format_param, '???'))
 523
 524             results.append({
 525                 'id':       video_id,
 526                 'url':      video_real_url,
 527                 'uploader': video_uploader,
 528                 'uploader_id': video_uploader_id,
 529                 'upload_date':  upload_date,
 530                 'title':    video_title,
 531                 'ext':      video_extension,
 532                 'format':   video_format,
 533                 'thumbnail':    video_thumbnail,
 534                 'description':  video_description,
 535                 'player_url':   player_url,
 536                 'subtitles':    video_subtitles,
 537                 'duration':     video_duration
 538             })
 539         return results
 540
 541
 542 class MetacafeIE(InfoExtractor):
 543     """Information Extractor for metacafe.com."""
 544
 545     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 546     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 547     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 548     IE_NAME = u'metacafe'
 549
 550     def __init__(self, downloader=None):
 551         InfoExtractor.__init__(self, downloader)
 552
 553     def report_disclaimer(self):
 554         """Report disclaimer retrieval."""
 555         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 556
 557     def report_age_confirmation(self):
 558         """Report attempt to confirm age."""
 559         self._downloader.to_screen(u'[metacafe] Confirming age')
 560
 561     def report_download_webpage(self, video_id):
 562         """Report webpage download."""
 563         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 564
 565     def report_extraction(self, video_id):
 566         """Report information extraction."""
 567         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 568
 569     def _real_initialize(self):
 570         # Retrieve disclaimer
 571         request = compat_urllib_request.Request(self._DISCLAIMER)
 572         try:
 573             self.report_disclaimer()
 574             disclaimer = compat_urllib_request.urlopen(request).read()
 575         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 576             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
 577             return
 578
 579         # Confirm age
 580         disclaimer_form = {
 581             'filters': '0',
 582             'submit': "Continue - I'm over 18",
 583             }
 584         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 585         try:
 586             self.report_age_confirmation()
 587             disclaimer = compat_urllib_request.urlopen(request).read()
 588         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 589             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 590             return
 591
 592     def _real_extract(self, url):
 593         # Extract id and simplified title from URL
 594         mobj = re.match(self._VALID_URL, url)
 595         if mobj is None:
 596             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 597             return
 598
 599         video_id = mobj.group(1)
 600
 601         # Check if video comes from YouTube
 602         mobj2 = re.match(r'^yt-(.*)$', video_id)
 603         if mobj2 is not None:
 604             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 605             return
 606
 607         # Retrieve video webpage to extract further information
 608         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 609         try:
 610             self.report_download_webpage(video_id)
 611             webpage = compat_urllib_request.urlopen(request).read()
 612         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 613             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 614             return
 615
 616         # Extract URL, uploader and title from webpage
 617         self.report_extraction(video_id)
 618         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 619         if mobj is not None:
 620             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 621             video_extension = mediaURL[-3:]
 622
 623             # Extract gdaKey if available
 624             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 625             if mobj is None:
 626                 video_url = mediaURL
 627             else:
 628                 gdaKey = mobj.group(1)
 629                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 630         else:
 631             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 632             if mobj is None:
 633                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 634                 return
 635             vardict = compat_parse_qs(mobj.group(1))
 636             if 'mediaData' not in vardict:
 637                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 638                 return
 639             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 640             if mobj is None:
 641                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 642                 return
 643             mediaURL = mobj.group(1).replace('\\/', '/')
 644             video_extension = mediaURL[-3:]
 645             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 646
 647         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 648         if mobj is None:
 649             self._downloader.trouble(u'ERROR: unable to extract title')
 650             return
 651         video_title = mobj.group(1).decode('utf-8')
 652
 653         mobj = re.search(r'submitter=(.*?);', webpage)
 654         if mobj is None:
 655             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 656             return
 657         video_uploader = mobj.group(1)
 658
 659         return [{
 660             'id':       video_id.decode('utf-8'),
 661             'url':      video_url.decode('utf-8'),
 662             'uploader': video_uploader.decode('utf-8'),
 663             'upload_date':  None,
 664             'title':    video_title,
 665             'ext':      video_extension.decode('utf-8'),
 666         }]
 667
 668
 669 class DailymotionIE(InfoExtractor):
 670     """Information Extractor for Dailymotion"""
 671
 672     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 673     IE_NAME = u'dailymotion'
 674
 675     def __init__(self, downloader=None):
 676         InfoExtractor.__init__(self, downloader)
 677
 678     def report_extraction(self, video_id):
 679         """Report information extraction."""
 680         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 681
 682     def _real_extract(self, url):
 683         # Extract id and simplified title from URL
 684         mobj = re.match(self._VALID_URL, url)
 685         if mobj is None:
 686             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 687             return
 688
 689         video_id = mobj.group(1).split('_')[0].split('?')[0]
 690
 691         video_extension = 'mp4'
 692
 693         # Retrieve video webpage to extract further information
 694         request = compat_urllib_request.Request(url)
 695         request.add_header('Cookie', 'family_filter=off')
 696         webpage = self._download_webpage(request, video_id)
 697
 698         # Extract URL, uploader and title from webpage
 699         self.report_extraction(video_id)
 700         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 701         if mobj is None:
 702             self._downloader.trouble(u'ERROR: unable to extract media URL')
 703             return
 704         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 705
 706         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 707             if key in flashvars:
 708                 max_quality = key
 709                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 710                 break
 711         else:
 712             self._downloader.trouble(u'ERROR: unable to extract video URL')
 713             return
 714
 715         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 716         if mobj is None:
 717             self._downloader.trouble(u'ERROR: unable to extract video URL')
 718             return
 719
 720         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 721
 722         # TODO: support choosing qualities
 723
 724         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 725         if mobj is None:
 726             self._downloader.trouble(u'ERROR: unable to extract title')
 727             return
 728         video_title = unescapeHTML(mobj.group('title'))
 729
 730         video_uploader = None
 731         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 732         if mobj is None:
 733             # lookin for official user
 734             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 735             if mobj_official is None:
 736                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 737             else:
 738                 video_uploader = mobj_official.group(1)
 739         else:
 740             video_uploader = mobj.group(1)
 741
 742         video_upload_date = None
 743         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 744         if mobj is not None:
 745             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 746
 747         return [{
 748             'id':       video_id,
 749             'url':      video_url,
 750             'uploader': video_uploader,
 751             'upload_date':  video_upload_date,
 752             'title':    video_title,
 753             'ext':      video_extension,
 754         }]
 755
 756
 757 class PhotobucketIE(InfoExtractor):
 758     """Information extractor for photobucket.com."""
 759
 760     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 761     IE_NAME = u'photobucket'
 762
 763     def __init__(self, downloader=None):
 764         InfoExtractor.__init__(self, downloader)
 765
 766     def report_download_webpage(self, video_id):
 767         """Report webpage download."""
 768         self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 769
 770     def report_extraction(self, video_id):
 771         """Report information extraction."""
 772         self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 773
 774     def _real_extract(self, url):
 775         # Extract id from URL
 776         mobj = re.match(self._VALID_URL, url)
 777         if mobj is None:
 778             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 779             return
 780
 781         video_id = mobj.group(1)
 782
 783         video_extension = 'flv'
 784
 785         # Retrieve video webpage to extract further information
 786         request = compat_urllib_request.Request(url)
 787         try:
 788             self.report_download_webpage(video_id)
 789             webpage = compat_urllib_request.urlopen(request).read()
 790         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 791             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 792             return
 793
 794         # Extract URL, uploader, and title from webpage
 795         self.report_extraction(video_id)
 796         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 797         if mobj is None:
 798             self._downloader.trouble(u'ERROR: unable to extract media URL')
 799             return
 800         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 801
 802         video_url = mediaURL
 803
 804         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 805         if mobj is None:
 806             self._downloader.trouble(u'ERROR: unable to extract title')
 807             return
 808         video_title = mobj.group(1).decode('utf-8')
 809
 810         video_uploader = mobj.group(2).decode('utf-8')
 811
 812         return [{
 813             'id':       video_id.decode('utf-8'),
 814             'url':      video_url.decode('utf-8'),
 815             'uploader': video_uploader,
 816             'upload_date':  None,
 817             'title':    video_title,
 818             'ext':      video_extension.decode('utf-8'),
 819         }]
 820
 821
 822 class YahooIE(InfoExtractor):
 823     """Information extractor for video.yahoo.com."""
 824
 825     _WORKING = False
 826     # _VALID_URL matches all Yahoo! Video URLs
 827     # _VPAGE_URL matches only the extractable '/watch/' URLs
 828     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 829     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 830     IE_NAME = u'video.yahoo'
 831
 832     def __init__(self, downloader=None):
 833         InfoExtractor.__init__(self, downloader)
 834
 835     def report_download_webpage(self, video_id):
 836         """Report webpage download."""
 837         self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 838
 839     def report_extraction(self, video_id):
 840         """Report information extraction."""
 841         self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 842
 843     def _real_extract(self, url, new_video=True):
 844         # Extract ID from URL
 845         mobj = re.match(self._VALID_URL, url)
 846         if mobj is None:
 847             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 848             return
 849
 850         video_id = mobj.group(2)
 851         video_extension = 'flv'
 852
 853         # Rewrite valid but non-extractable URLs as
 854         # extractable English language /watch/ URLs
 855         if re.match(self._VPAGE_URL, url) is None:
 856             request = compat_urllib_request.Request(url)
 857             try:
 858                 webpage = compat_urllib_request.urlopen(request).read()
 859             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 860                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 861                 return
 862
 863             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 864             if mobj is None:
 865                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 866                 return
 867             yahoo_id = mobj.group(1)
 868
 869             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 870             if mobj is None:
 871                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 872                 return
 873             yahoo_vid = mobj.group(1)
 874
 875             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 876             return self._real_extract(url, new_video=False)
 877
 878         # Retrieve video webpage to extract further information
 879         request = compat_urllib_request.Request(url)
 880         try:
 881             self.report_download_webpage(video_id)
 882             webpage = compat_urllib_request.urlopen(request).read()
 883         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 884             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 885             return
 886
 887         # Extract uploader and title from webpage
 888         self.report_extraction(video_id)
 889         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 890         if mobj is None:
 891             self._downloader.trouble(u'ERROR: unable to extract video title')
 892             return
 893         video_title = mobj.group(1).decode('utf-8')
 894
 895         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 896         if mobj is None:
 897             self._downloader.trouble(u'ERROR: unable to extract video uploader')
 898             return
 899         video_uploader = mobj.group(1).decode('utf-8')
 900
 901         # Extract video thumbnail
 902         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 903         if mobj is None:
 904             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 905             return
 906         video_thumbnail = mobj.group(1).decode('utf-8')
 907
 908         # Extract video description
 909         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 910         if mobj is None:
 911             self._downloader.trouble(u'ERROR: unable to extract video description')
 912             return
 913         video_description = mobj.group(1).decode('utf-8')
 914         if not video_description:
 915             video_description = 'No description available.'
 916
 917         # Extract video height and width
 918         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 919         if mobj is None:
 920             self._downloader.trouble(u'ERROR: unable to extract video height')
 921             return
 922         yv_video_height = mobj.group(1)
 923
 924         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 925         if mobj is None:
 926             self._downloader.trouble(u'ERROR: unable to extract video width')
 927             return
 928         yv_video_width = mobj.group(1)
 929
 930         # Retrieve video playlist to extract media URL
 931         # I'm not completely sure what all these options are, but we
 932         # seem to need most of them, otherwise the server sends a 401.
 933         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 934         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 935         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 936                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 937                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 938         try:
 939             self.report_download_webpage(video_id)
 940             webpage = compat_urllib_request.urlopen(request).read()
 941         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 942             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 943             return
 944
 945         # Extract media URL from playlist XML
 946         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
 947         if mobj is None:
 948             self._downloader.trouble(u'ERROR: Unable to extract media URL')
 949             return
 950         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
 951         video_url = unescapeHTML(video_url)
 952
 953         return [{
 954             'id':       video_id.decode('utf-8'),
 955             'url':      video_url,
 956             'uploader': video_uploader,
 957             'upload_date':  None,
 958             'title':    video_title,
 959             'ext':      video_extension.decode('utf-8'),
 960             'thumbnail':    video_thumbnail.decode('utf-8'),
 961             'description':  video_description,
 962         }]
 963
 964
 965 class VimeoIE(InfoExtractor):
 966     """Information extractor for vimeo.com."""
 967
 968     # _VALID_URL matches Vimeo URLs
 969     _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
 970     IE_NAME = u'vimeo'
 971
 972     def __init__(self, downloader=None):
 973         InfoExtractor.__init__(self, downloader)
 974
 975     def report_download_webpage(self, video_id):
 976         """Report webpage download."""
 977         self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
 978
 979     def report_extraction(self, video_id):
 980         """Report information extraction."""
 981         self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
 982
 983     def _real_extract(self, url, new_video=True):
 984         # Extract ID from URL
 985         mobj = re.match(self._VALID_URL, url)
 986         if mobj is None:
 987             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 988             return
 989
 990         video_id = mobj.group(1)
 991
 992         # Retrieve video webpage to extract further information
 993         request = compat_urllib_request.Request(url, None, std_headers)
 994         try:
 995             self.report_download_webpage(video_id)
 996             webpage_bytes = compat_urllib_request.urlopen(request).read()
 997             webpage = webpage_bytes.decode('utf-8')
 998         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 999             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1000             return
1001
1002         # Now we begin extracting as much information as we can from what we
1003         # retrieved. First we extract the information common to all extractors,
1004         # and latter we extract those that are Vimeo specific.
1005         self.report_extraction(video_id)
1006
1007         # Extract the config JSON
1008         try:
1009             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1010             config = json.loads(config)
1011         except:
1012             self._downloader.trouble(u'ERROR: unable to extract info section')
1013             return
1014
1015         # Extract title
1016         video_title = config["video"]["title"]
1017
1018         # Extract uploader and uploader_id
1019         video_uploader = config["video"]["owner"]["name"]
1020         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1021
1022         # Extract video thumbnail
1023         video_thumbnail = config["video"]["thumbnail"]
1024
1025         # Extract video description
1026         video_description = get_element_by_attribute("itemprop", "description", webpage)
1027         if video_description: video_description = clean_html(video_description)
1028         else: video_description = ''
1029
1030         # Extract upload date
1031         video_upload_date = None
1032         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1033         if mobj is not None:
1034             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1035
1036         # Vimeo specific: extract request signature and timestamp
1037         sig = config['request']['signature']
1038         timestamp = config['request']['timestamp']
1039
1040         # Vimeo specific: extract video codec and quality information
1041         # First consider quality, then codecs, then take everything
1042         # TODO bind to format param
1043         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1044         files = { 'hd': [], 'sd': [], 'other': []}
1045         for codec_name, codec_extension in codecs:
1046             if codec_name in config["video"]["files"]:
1047                 if 'hd' in config["video"]["files"][codec_name]:
1048                     files['hd'].append((codec_name, codec_extension, 'hd'))
1049                 elif 'sd' in config["video"]["files"][codec_name]:
1050                     files['sd'].append((codec_name, codec_extension, 'sd'))
1051                 else:
1052                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1053
1054         for quality in ('hd', 'sd', 'other'):
1055             if len(files[quality]) > 0:
1056                 video_quality = files[quality][0][2]
1057                 video_codec = files[quality][0][0]
1058                 video_extension = files[quality][0][1]
1059                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1060                 break
1061         else:
1062             self._downloader.trouble(u'ERROR: no known codec found')
1063             return
1064
1065         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1066                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1067
1068         return [{
1069             'id':       video_id,
1070             'url':      video_url,
1071             'uploader': video_uploader,
1072             'uploader_id': video_uploader_id,
1073             'upload_date':  video_upload_date,
1074             'title':    video_title,
1075             'ext':      video_extension,
1076             'thumbnail':    video_thumbnail,
1077             'description':  video_description,
1078         }]
1079
1080
1081 class ArteTvIE(InfoExtractor):
1082     """arte.tv information extractor."""
1083
1084     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1085     _LIVE_URL = r'index-[0-9]+\.html$'
1086
1087     IE_NAME = u'arte.tv'
1088
1089     def __init__(self, downloader=None):
1090         InfoExtractor.__init__(self, downloader)
1091
1092     def report_download_webpage(self, video_id):
1093         """Report webpage download."""
1094         self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1095
1096     def report_extraction(self, video_id):
1097         """Report information extraction."""
1098         self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1099
1100     def fetch_webpage(self, url):
1101         request = compat_urllib_request.Request(url)
1102         try:
1103             self.report_download_webpage(url)
1104             webpage = compat_urllib_request.urlopen(request).read()
1105         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1106             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1107             return
1108         except ValueError as err:
1109             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1110             return
1111         return webpage
1112
1113     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1114         page = self.fetch_webpage(url)
1115         mobj = re.search(regex, page, regexFlags)
1116         info = {}
1117
1118         if mobj is None:
1119             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1120             return
1121
1122         for (i, key, err) in matchTuples:
1123             if mobj.group(i) is None:
1124                 self._downloader.trouble(err)
1125                 return
1126             else:
1127                 info[key] = mobj.group(i)
1128
1129         return info
1130
1131     def extractLiveStream(self, url):
1132         video_lang = url.split('/')[-4]
1133         info = self.grep_webpage(
1134             url,
1135             r'src="(.*?/videothek_js.*?\.js)',
1136             0,
1137             [
1138                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1139             ]
1140         )
1141         http_host = url.split('/')[2]
1142         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1143         info = self.grep_webpage(
1144             next_url,
1145             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1146                 '(http://.*?\.swf).*?' +
1147                 '(rtmp://.*?)\'',
1148             re.DOTALL,
1149             [
1150                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1151                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1152                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1153             ]
1154         )
1155         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1156
1157     def extractPlus7Stream(self, url):
1158         video_lang = url.split('/')[-3]
1159         info = self.grep_webpage(
1160             url,
1161             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1162             0,
1163             [
1164                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1165             ]
1166         )
1167         next_url = compat_urllib_parse.unquote(info.get('url'))
1168         info = self.grep_webpage(
1169             next_url,
1170             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1171             0,
1172             [
1173                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1174             ]
1175         )
1176         next_url = compat_urllib_parse.unquote(info.get('url'))
1177
1178         info = self.grep_webpage(
1179             next_url,
1180             r'<video id="(.*?)".*?>.*?' +
1181                 '<name>(.*?)</name>.*?' +
1182                 '<dateVideo>(.*?)</dateVideo>.*?' +
1183                 '<url quality="hd">(.*?)</url>',
1184             re.DOTALL,
1185             [
1186                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1187                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1188                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1189                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1190             ]
1191         )
1192
1193         return {
1194             'id':           info.get('id'),
1195             'url':          compat_urllib_parse.unquote(info.get('url')),
1196             'uploader':     u'arte.tv',
1197             'upload_date':  info.get('date'),
1198             'title':        info.get('title').decode('utf-8'),
1199             'ext':          u'mp4',
1200             'format':       u'NA',
1201             'player_url':   None,
1202         }
1203
1204     def _real_extract(self, url):
1205         video_id = url.split('/')[-1]
1206         self.report_extraction(video_id)
1207
1208         if re.search(self._LIVE_URL, video_id) is not None:
1209             self.extractLiveStream(url)
1210             return
1211         else:
1212             info = self.extractPlus7Stream(url)
1213
1214         return [info]
1215
1216
1217 class GenericIE(InfoExtractor):
1218     """Generic last-resort information extractor."""
1219
1220     _VALID_URL = r'.*'
1221     IE_NAME = u'generic'
1222
1223     def __init__(self, downloader=None):
1224         InfoExtractor.__init__(self, downloader)
1225
1226     def report_download_webpage(self, video_id):
1227         """Report webpage download."""
1228         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1229         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1230
1231     def report_extraction(self, video_id):
1232         """Report information extraction."""
1233         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1234
1235     def report_following_redirect(self, new_url):
1236         """Report information extraction."""
1237         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1238
1239     def _test_redirect(self, url):
1240         """Check if it is a redirect, like url shorteners, in case restart chain."""
1241         class HeadRequest(compat_urllib_request.Request):
1242             def get_method(self):
1243                 return "HEAD"
1244
1245         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1246             """
1247             Subclass the HTTPRedirectHandler to make it use our
1248             HeadRequest also on the redirected URL
1249             """
1250             def redirect_request(self, req, fp, code, msg, headers, newurl):
1251                 if code in (301, 302, 303, 307):
1252                     newurl = newurl.replace(' ', '%20')
1253                     newheaders = dict((k,v) for k,v in req.headers.items()
1254                                       if k.lower() not in ("content-length", "content-type"))
1255                     return HeadRequest(newurl,
1256                                        headers=newheaders,
1257                                        origin_req_host=req.get_origin_req_host(),
1258                                        unverifiable=True)
1259                 else:
1260                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1261
1262         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1263             """
1264             Fallback to GET if HEAD is not allowed (405 HTTP error)
1265             """
1266             def http_error_405(self, req, fp, code, msg, headers):
1267                 fp.read()
1268                 fp.close()
1269
1270                 newheaders = dict((k,v) for k,v in req.headers.items()
1271                                   if k.lower() not in ("content-length", "content-type"))
1272                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1273                                                  headers=newheaders,
1274                                                  origin_req_host=req.get_origin_req_host(),
1275                                                  unverifiable=True))
1276
1277         # Build our opener
1278         opener = compat_urllib_request.OpenerDirector()
1279         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1280                         HTTPMethodFallback, HEADRedirectHandler,
1281                         compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1282             opener.add_handler(handler())
1283
1284         response = opener.open(HeadRequest(url))
1285         new_url = response.geturl()
1286
1287         if url == new_url:
1288             return False
1289
1290         self.report_following_redirect(new_url)
1291         self._downloader.download([new_url])
1292         return True
1293
1294     def _real_extract(self, url):
1295         if self._test_redirect(url): return
1296
1297         video_id = url.split('/')[-1]
1298         request = compat_urllib_request.Request(url)
1299         try:
1300             self.report_download_webpage(video_id)
1301             webpage = compat_urllib_request.urlopen(request).read()
1302         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1303             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1304             return
1305         except ValueError as err:
1306             # since this is the last-resort InfoExtractor, if
1307             # this error is thrown, it'll be thrown here
1308             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1309             return
1310
1311         self.report_extraction(video_id)
1312         # Start with something easy: JW Player in SWFObject
1313         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1314         if mobj is None:
1315             # Broaden the search a little bit
1316             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1317         if mobj is None:
1318             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1319             return
1320
1321         # It's possible that one of the regexes
1322         # matched, but returned an empty group:
1323         if mobj.group(1) is None:
1324             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1325             return
1326
1327         video_url = compat_urllib_parse.unquote(mobj.group(1))
1328         video_id = os.path.basename(video_url)
1329
1330         # here's a fun little line of code for you:
1331         video_extension = os.path.splitext(video_id)[1][1:]
1332         video_id = os.path.splitext(video_id)[0]
1333
1334         # it's tempting to parse this further, but you would
1335         # have to take into account all the variations like
1336         #   Video Title - Site Name
1337         #   Site Name | Video Title
1338         #   Video Title - Tagline | Site Name
1339         # and so on and so forth; it's just not practical
1340         mobj = re.search(r'<title>(.*)</title>', webpage)
1341         if mobj is None:
1342             self._downloader.trouble(u'ERROR: unable to extract title')
1343             return
1344         video_title = mobj.group(1)
1345
1346         # video uploader is domain name
1347         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1348         if mobj is None:
1349             self._downloader.trouble(u'ERROR: unable to extract title')
1350             return
1351         video_uploader = mobj.group(1)
1352
1353         return [{
1354             'id':       video_id,
1355             'url':      video_url,
1356             'uploader': video_uploader,
1357             'upload_date':  None,
1358             'title':    video_title,
1359             'ext':      video_extension,
1360         }]
1361
1362
1363 class YoutubeSearchIE(InfoExtractor):
1364     """Information Extractor for YouTube search queries."""
1365     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1366     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1367     _max_youtube_results = 1000
1368     IE_NAME = u'youtube:search'
1369
1370     def __init__(self, downloader=None):
1371         InfoExtractor.__init__(self, downloader)
1372
1373     def report_download_page(self, query, pagenum):
1374         """Report attempt to download search page with given number."""
1375         query = query.decode(preferredencoding())
1376         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1377
1378     def _real_extract(self, query):
1379         mobj = re.match(self._VALID_URL, query)
1380         if mobj is None:
1381             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1382             return
1383
1384         prefix, query = query.split(':')
1385         prefix = prefix[8:]
1386         query = query.encode('utf-8')
1387         if prefix == '':
1388             self._download_n_results(query, 1)
1389             return
1390         elif prefix == 'all':
1391             self._download_n_results(query, self._max_youtube_results)
1392             return
1393         else:
1394             try:
1395                 n = int(prefix)
1396                 if n <= 0:
1397                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1398                     return
1399                 elif n > self._max_youtube_results:
1400                     self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1401                     n = self._max_youtube_results
1402                 self._download_n_results(query, n)
1403                 return
1404             except ValueError: # parsing prefix as integer fails
1405                 self._download_n_results(query, 1)
1406                 return
1407
1408     def _download_n_results(self, query, n):
1409         """Downloads a specified number of results for a query"""
1410
1411         video_ids = []
1412         pagenum = 0
1413         limit = n
1414
1415         while (50 * pagenum) < limit:
1416             self.report_download_page(query, pagenum+1)
1417             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1418             request = compat_urllib_request.Request(result_url)
1419             try:
1420                 data = compat_urllib_request.urlopen(request).read()
1421             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1422                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1423                 return
1424             api_response = json.loads(data)['data']
1425
1426             new_ids = list(video['id'] for video in api_response['items'])
1427             video_ids += new_ids
1428
1429             limit = min(n, api_response['totalItems'])
1430             pagenum += 1
1431
1432         if len(video_ids) > n:
1433             video_ids = video_ids[:n]
1434         for id in video_ids:
1435             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1436         return
1437
1438
1439 class GoogleSearchIE(InfoExtractor):
1440     """Information Extractor for Google Video search queries."""
1441     _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1442     _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1443     _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1444     _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1445     _max_google_results = 1000
1446     IE_NAME = u'video.google:search'
1447
1448     def __init__(self, downloader=None):
1449         InfoExtractor.__init__(self, downloader)
1450
1451     def report_download_page(self, query, pagenum):
1452         """Report attempt to download playlist page with given number."""
1453         query = query.decode(preferredencoding())
1454         self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1455
1456     def _real_extract(self, query):
1457         mobj = re.match(self._VALID_URL, query)
1458         if mobj is None:
1459             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1460             return
1461
1462         prefix, query = query.split(':')
1463         prefix = prefix[8:]
1464         query = query.encode('utf-8')
1465         if prefix == '':
1466             self._download_n_results(query, 1)
1467             return
1468         elif prefix == 'all':
1469             self._download_n_results(query, self._max_google_results)
1470             return
1471         else:
1472             try:
1473                 n = int(prefix)
1474                 if n <= 0:
1475                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1476                     return
1477                 elif n > self._max_google_results:
1478                     self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1479                     n = self._max_google_results
1480                 self._download_n_results(query, n)
1481                 return
1482             except ValueError: # parsing prefix as integer fails
1483                 self._download_n_results(query, 1)
1484                 return
1485
1486     def _download_n_results(self, query, n):
1487         """Downloads a specified number of results for a query"""
1488
1489         video_ids = []
1490         pagenum = 0
1491
1492         while True:
1493             self.report_download_page(query, pagenum)
1494             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1495             request = compat_urllib_request.Request(result_url)
1496             try:
1497                 page = compat_urllib_request.urlopen(request).read()
1498             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1499                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1500                 return
1501
1502             # Extract video identifiers
1503             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1504                 video_id = mobj.group(1)
1505                 if video_id not in video_ids:
1506                     video_ids.append(video_id)
1507                     if len(video_ids) == n:
1508                         # Specified n videos reached
1509                         for id in video_ids:
1510                             self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1511                         return
1512
1513             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1514                 for id in video_ids:
1515                     self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1516                 return
1517
1518             pagenum = pagenum + 1
1519
1520
1521 class YahooSearchIE(InfoExtractor):
1522     """Information Extractor for Yahoo! Video search queries."""
1523
1524     _WORKING = False
1525     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1526     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1527     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1528     _MORE_PAGES_INDICATOR = r'\s*Next'
1529     _max_yahoo_results = 1000
1530     IE_NAME = u'video.yahoo:search'
1531
1532     def __init__(self, downloader=None):
1533         InfoExtractor.__init__(self, downloader)
1534
1535     def report_download_page(self, query, pagenum):
1536         """Report attempt to download playlist page with given number."""
1537         query = query.decode(preferredencoding())
1538         self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1539
1540     def _real_extract(self, query):
1541         mobj = re.match(self._VALID_URL, query)
1542         if mobj is None:
1543             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1544             return
1545
1546         prefix, query = query.split(':')
1547         prefix = prefix[8:]
1548         query = query.encode('utf-8')
1549         if prefix == '':
1550             self._download_n_results(query, 1)
1551             return
1552         elif prefix == 'all':
1553             self._download_n_results(query, self._max_yahoo_results)
1554             return
1555         else:
1556             try:
1557                 n = int(prefix)
1558                 if n <= 0:
1559                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1560                     return
1561                 elif n > self._max_yahoo_results:
1562                     self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1563                     n = self._max_yahoo_results
1564                 self._download_n_results(query, n)
1565                 return
1566             except ValueError: # parsing prefix as integer fails
1567                 self._download_n_results(query, 1)
1568                 return
1569
1570     def _download_n_results(self, query, n):
1571         """Downloads a specified number of results for a query"""
1572
1573         video_ids = []
1574         already_seen = set()
1575         pagenum = 1
1576
1577         while True:
1578             self.report_download_page(query, pagenum)
1579             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1580             request = compat_urllib_request.Request(result_url)
1581             try:
1582                 page = compat_urllib_request.urlopen(request).read()
1583             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1584                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1585                 return
1586
1587             # Extract video identifiers
1588             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1589                 video_id = mobj.group(1)
1590                 if video_id not in already_seen:
1591                     video_ids.append(video_id)
1592                     already_seen.add(video_id)
1593                     if len(video_ids) == n:
1594                         # Specified n videos reached
1595                         for id in video_ids:
1596                             self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1597                         return
1598
1599             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1600                 for id in video_ids:
1601                     self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1602                 return
1603
1604             pagenum = pagenum + 1
1605
1606
1607 class YoutubePlaylistIE(InfoExtractor):
1608     """Information Extractor for YouTube playlists."""
1609
1610     _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1611     _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1612     _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1613     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1614     IE_NAME = u'youtube:playlist'
1615
1616     def __init__(self, downloader=None):
1617         InfoExtractor.__init__(self, downloader)
1618
1619     def report_download_page(self, playlist_id, pagenum):
1620         """Report attempt to download playlist page with given number."""
1621         self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1622
1623     def _real_extract(self, url):
1624         # Extract playlist id
1625         mobj = re.match(self._VALID_URL, url)
1626         if mobj is None:
1627             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1628             return
1629
1630         # Single video case
1631         if mobj.group(3) is not None:
1632             self._downloader.download([mobj.group(3)])
1633             return
1634
1635         # Download playlist pages
1636         # prefix is 'p' as default for playlists but there are other types that need extra care
1637         playlist_prefix = mobj.group(1)
1638         if playlist_prefix == 'a':
1639             playlist_access = 'artist'
1640         else:
1641             playlist_prefix = 'p'
1642             playlist_access = 'view_play_list'
1643         playlist_id = mobj.group(2)
1644         video_ids = []
1645         pagenum = 1
1646
1647         while True:
1648             self.report_download_page(playlist_id, pagenum)
1649             url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1650             request = compat_urllib_request.Request(url)
1651             try:
1652                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1653             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1654                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1655                 return
1656
1657             # Extract video identifiers
1658             ids_in_page = []
1659             for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1660                 if mobj.group(1) not in ids_in_page:
1661                     ids_in_page.append(mobj.group(1))
1662             video_ids.extend(ids_in_page)
1663
1664             if self._MORE_PAGES_INDICATOR not in page:
1665                 break
1666             pagenum = pagenum + 1
1667
1668         total = len(video_ids)
1669
1670         playliststart = self._downloader.params.get('playliststart', 1) - 1
1671         playlistend = self._downloader.params.get('playlistend', -1)
1672         if playlistend == -1:
1673             video_ids = video_ids[playliststart:]
1674         else:
1675             video_ids = video_ids[playliststart:playlistend]
1676
1677         if len(video_ids) == total:
1678             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1679         else:
1680             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1681
1682         for id in video_ids:
1683             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1684         return
1685
1686
1687 class YoutubeChannelIE(InfoExtractor):
1688     """Information Extractor for YouTube channels."""
1689
1690     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1691     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1692     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1693     IE_NAME = u'youtube:channel'
1694
1695     def report_download_page(self, channel_id, pagenum):
1696         """Report attempt to download channel page with given number."""
1697         self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1698
1699     def _real_extract(self, url):
1700         # Extract channel id
1701         mobj = re.match(self._VALID_URL, url)
1702         if mobj is None:
1703             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1704             return
1705
1706         # Download channel pages
1707         channel_id = mobj.group(1)
1708         video_ids = []
1709         pagenum = 1
1710
1711         while True:
1712             self.report_download_page(channel_id, pagenum)
1713             url = self._TEMPLATE_URL % (channel_id, pagenum)
1714             request = compat_urllib_request.Request(url)
1715             try:
1716                 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1717             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1718                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1719                 return
1720
1721             # Extract video identifiers
1722             ids_in_page = []
1723             for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1724                 if mobj.group(1) not in ids_in_page:
1725                     ids_in_page.append(mobj.group(1))
1726             video_ids.extend(ids_in_page)
1727
1728             if self._MORE_PAGES_INDICATOR not in page:
1729                 break
1730             pagenum = pagenum + 1
1731
1732         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1733
1734         for id in video_ids:
1735             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1736         return
1737
1738
1739 class YoutubeUserIE(InfoExtractor):
1740     """Information Extractor for YouTube users."""
1741
1742     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1743     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1744     _GDATA_PAGE_SIZE = 50
1745     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1746     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1747     IE_NAME = u'youtube:user'
1748
1749     def __init__(self, downloader=None):
1750         InfoExtractor.__init__(self, downloader)
1751
1752     def report_download_page(self, username, start_index):
1753         """Report attempt to download user page."""
1754         self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1755                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1756
1757     def _real_extract(self, url):
1758         # Extract username
1759         mobj = re.match(self._VALID_URL, url)
1760         if mobj is None:
1761             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1762             return
1763
1764         username = mobj.group(1)
1765
1766         # Download video ids using YouTube Data API. Result size per
1767         # query is limited (currently to 50 videos) so we need to query
1768         # page by page until there are no video ids - it means we got
1769         # all of them.
1770
1771         video_ids = []
1772         pagenum = 0
1773
1774         while True:
1775             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1776             self.report_download_page(username, start_index)
1777
1778             request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1779
1780             try:
1781                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1782             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1783                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1784                 return
1785
1786             # Extract video identifiers
1787             ids_in_page = []
1788
1789             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1790                 if mobj.group(1) not in ids_in_page:
1791                     ids_in_page.append(mobj.group(1))
1792
1793             video_ids.extend(ids_in_page)
1794
1795             # A little optimization - if current page is not
1796             # "full", ie. does not contain PAGE_SIZE video ids then
1797             # we can assume that this page is the last one - there
1798             # are no more ids on further pages - no need to query
1799             # again.
1800
1801             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1802                 break
1803
1804             pagenum += 1
1805
1806         all_ids_count = len(video_ids)
1807         playliststart = self._downloader.params.get('playliststart', 1) - 1
1808         playlistend = self._downloader.params.get('playlistend', -1)
1809
1810         if playlistend == -1:
1811             video_ids = video_ids[playliststart:]
1812         else:
1813             video_ids = video_ids[playliststart:playlistend]
1814
1815         self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1816                 (username, all_ids_count, len(video_ids)))
1817
1818         for video_id in video_ids:
1819             self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1820
1821
1822 class BlipTVUserIE(InfoExtractor):
1823     """Information Extractor for blip.tv users."""
1824
1825     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1826     _PAGE_SIZE = 12
1827     IE_NAME = u'blip.tv:user'
1828
1829     def __init__(self, downloader=None):
1830         InfoExtractor.__init__(self, downloader)
1831
1832     def report_download_page(self, username, pagenum):
1833         """Report attempt to download user page."""
1834         self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1835                 (self.IE_NAME, username, pagenum))
1836
1837     def _real_extract(self, url):
1838         # Extract username
1839         mobj = re.match(self._VALID_URL, url)
1840         if mobj is None:
1841             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1842             return
1843
1844         username = mobj.group(1)
1845
1846         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1847
1848         request = compat_urllib_request.Request(url)
1849
1850         try:
1851             page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1852             mobj = re.search(r'data-users-id="([^"]+)"', page)
1853             page_base = page_base % mobj.group(1)
1854         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1855             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1856             return
1857
1858
1859         # Download video ids using BlipTV Ajax calls. Result size per
1860         # query is limited (currently to 12 videos) so we need to query
1861         # page by page until there are no video ids - it means we got
1862         # all of them.
1863
1864         video_ids = []
1865         pagenum = 1
1866
1867         while True:
1868             self.report_download_page(username, pagenum)
1869
1870             request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1871
1872             try:
1873                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1874             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1875                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1876                 return
1877
1878             # Extract video identifiers
1879             ids_in_page = []
1880
1881             for mobj in re.finditer(r'href="/([^"]+)"', page):
1882                 if mobj.group(1) not in ids_in_page:
1883                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1884
1885             video_ids.extend(ids_in_page)
1886
1887             # A little optimization - if current page is not
1888             # "full", ie. does not contain PAGE_SIZE video ids then
1889             # we can assume that this page is the last one - there
1890             # are no more ids on further pages - no need to query
1891             # again.
1892
1893             if len(ids_in_page) < self._PAGE_SIZE:
1894                 break
1895
1896             pagenum += 1
1897
1898         all_ids_count = len(video_ids)
1899         playliststart = self._downloader.params.get('playliststart', 1) - 1
1900         playlistend = self._downloader.params.get('playlistend', -1)
1901
1902         if playlistend == -1:
1903             video_ids = video_ids[playliststart:]
1904         else:
1905             video_ids = video_ids[playliststart:playlistend]
1906
1907         self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1908                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1909
1910         for video_id in video_ids:
1911             self._downloader.download([u'http://blip.tv/'+video_id])
1912
1913
1914 class DepositFilesIE(InfoExtractor):
1915     """Information extractor for depositfiles.com"""
1916
1917     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1918
1919     def report_download_webpage(self, file_id):
1920         """Report webpage download."""
1921         self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1922
1923     def report_extraction(self, file_id):
1924         """Report information extraction."""
1925         self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1926
1927     def _real_extract(self, url):
1928         file_id = url.split('/')[-1]
1929         # Rebuild url in english locale
1930         url = 'http://depositfiles.com/en/files/' + file_id
1931
1932         # Retrieve file webpage with 'Free download' button pressed
1933         free_download_indication = { 'gateway_result' : '1' }
1934         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1935         try:
1936             self.report_download_webpage(file_id)
1937             webpage = compat_urllib_request.urlopen(request).read()
1938         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1939             self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1940             return
1941
1942         # Search for the real file URL
1943         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1944         if (mobj is None) or (mobj.group(1) is None):
1945             # Try to figure out reason of the error.
1946             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1947             if (mobj is not None) and (mobj.group(1) is not None):
1948                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1949                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1950             else:
1951                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1952             return
1953
1954         file_url = mobj.group(1)
1955         file_extension = os.path.splitext(file_url)[1][1:]
1956
1957         # Search for file title
1958         mobj = re.search(r'<b title="(.*?)">', webpage)
1959         if mobj is None:
1960             self._downloader.trouble(u'ERROR: unable to extract title')
1961             return
1962         file_title = mobj.group(1).decode('utf-8')
1963
1964         return [{
1965             'id':       file_id.decode('utf-8'),
1966             'url':      file_url.decode('utf-8'),
1967             'uploader': None,
1968             'upload_date':  None,
1969             'title':    file_title,
1970             'ext':      file_extension.decode('utf-8'),
1971         }]
1972
1973
1974 class FacebookIE(InfoExtractor):
1975     """Information Extractor for Facebook"""
1976
1977     _WORKING = False
1978     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1979     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1980     _NETRC_MACHINE = 'facebook'
1981     _available_formats = ['video', 'highqual', 'lowqual']
1982     _video_extensions = {
1983         'video': 'mp4',
1984         'highqual': 'mp4',
1985         'lowqual': 'mp4',
1986     }
1987     IE_NAME = u'facebook'
1988
1989     def __init__(self, downloader=None):
1990         InfoExtractor.__init__(self, downloader)
1991
1992     def _reporter(self, message):
1993         """Add header and report message."""
1994         self._downloader.to_screen(u'[facebook] %s' % message)
1995
1996     def report_login(self):
1997         """Report attempt to log in."""
1998         self._reporter(u'Logging in')
1999
2000     def report_video_webpage_download(self, video_id):
2001         """Report attempt to download video webpage."""
2002         self._reporter(u'%s: Downloading video webpage' % video_id)
2003
2004     def report_information_extraction(self, video_id):
2005         """Report attempt to extract video information."""
2006         self._reporter(u'%s: Extracting video information' % video_id)
2007
2008     def _parse_page(self, video_webpage):
2009         """Extract video information from page"""
2010         # General data
2011         data = {'title': r'\("video_title", "(.*?)"\)',
2012             'description': r'<div class="datawrap">(.*?)</div>',
2013             'owner': r'\("video_owner_name", "(.*?)"\)',
2014             'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2015             }
2016         video_info = {}
2017         for piece in data.keys():
2018             mobj = re.search(data[piece], video_webpage)
2019             if mobj is not None:
2020                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2021
2022         # Video urls
2023         video_urls = {}
2024         for fmt in self._available_formats:
2025             mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2026             if mobj is not None:
2027                 # URL is in a Javascript segment inside an escaped Unicode format within
2028                 # the generally utf-8 page
2029                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2030         video_info['video_urls'] = video_urls
2031
2032         return video_info
2033
2034     def _real_initialize(self):
2035         if self._downloader is None:
2036             return
2037
2038         useremail = None
2039         password = None
2040         downloader_params = self._downloader.params
2041
2042         # Attempt to use provided username and password or .netrc data
2043         if downloader_params.get('username', None) is not None:
2044             useremail = downloader_params['username']
2045             password = downloader_params['password']
2046         elif downloader_params.get('usenetrc', False):
2047             try:
2048                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2049                 if info is not None:
2050                     useremail = info[0]
2051                     password = info[2]
2052                 else:
2053                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2054             except (IOError, netrc.NetrcParseError) as err:
2055                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2056                 return
2057
2058         if useremail is None:
2059             return
2060
2061         # Log in
2062         login_form = {
2063             'email': useremail,
2064             'pass': password,
2065             'login': 'Log+In'
2066             }
2067         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2068         try:
2069             self.report_login()
2070             login_results = compat_urllib_request.urlopen(request).read()
2071             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2072                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2073                 return
2074         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2075             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2076             return
2077
2078     def _real_extract(self, url):
2079         mobj = re.match(self._VALID_URL, url)
2080         if mobj is None:
2081             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2082             return
2083         video_id = mobj.group('ID')
2084
2085         # Get video webpage
2086         self.report_video_webpage_download(video_id)
2087         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2088         try:
2089             page = compat_urllib_request.urlopen(request)
2090             video_webpage = page.read()
2091         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2092             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2093             return
2094
2095         # Start extracting information
2096         self.report_information_extraction(video_id)
2097
2098         # Extract information
2099         video_info = self._parse_page(video_webpage)
2100
2101         # uploader
2102         if 'owner' not in video_info:
2103             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2104             return
2105         video_uploader = video_info['owner']
2106
2107         # title
2108         if 'title' not in video_info:
2109             self._downloader.trouble(u'ERROR: unable to extract video title')
2110             return
2111         video_title = video_info['title']
2112         video_title = video_title.decode('utf-8')
2113
2114         # thumbnail image
2115         if 'thumbnail' not in video_info:
2116             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2117             video_thumbnail = ''
2118         else:
2119             video_thumbnail = video_info['thumbnail']
2120
2121         # upload date
2122         upload_date = None
2123         if 'upload_date' in video_info:
2124             upload_time = video_info['upload_date']
2125             timetuple = email.utils.parsedate_tz(upload_time)
2126             if timetuple is not None:
2127                 try:
2128                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2129                 except:
2130                     pass
2131
2132         # description
2133         video_description = video_info.get('description', 'No description available.')
2134
2135         url_map = video_info['video_urls']
2136         if url_map:
2137             # Decide which formats to download
2138             req_format = self._downloader.params.get('format', None)
2139             format_limit = self._downloader.params.get('format_limit', None)
2140
2141             if format_limit is not None and format_limit in self._available_formats:
2142                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2143             else:
2144                 format_list = self._available_formats
2145             existing_formats = [x for x in format_list if x in url_map]
2146             if len(existing_formats) == 0:
2147                 self._downloader.trouble(u'ERROR: no known formats available for video')
2148                 return
2149             if req_format is None:
2150                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2151             elif req_format == 'worst':
2152                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2153             elif req_format == '-1':
2154                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2155             else:
2156                 # Specific format
2157                 if req_format not in url_map:
2158                     self._downloader.trouble(u'ERROR: requested format not available')
2159                     return
2160                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2161
2162         results = []
2163         for format_param, video_real_url in video_url_list:
2164             # Extension
2165             video_extension = self._video_extensions.get(format_param, 'mp4')
2166
2167             results.append({
2168                 'id':       video_id.decode('utf-8'),
2169                 'url':      video_real_url.decode('utf-8'),
2170                 'uploader': video_uploader.decode('utf-8'),
2171                 'upload_date':  upload_date,
2172                 'title':    video_title,
2173                 'ext':      video_extension.decode('utf-8'),
2174                 'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
2175                 'thumbnail':    video_thumbnail.decode('utf-8'),
2176                 'description':  video_description.decode('utf-8'),
2177             })
2178         return results
2179
2180 class BlipTVIE(InfoExtractor):
2181     """Information extractor for blip.tv"""
2182
2183     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2184     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2185     IE_NAME = u'blip.tv'
2186
2187     def report_extraction(self, file_id):
2188         """Report information extraction."""
2189         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2190
2191     def report_direct_download(self, title):
2192         """Report information extraction."""
2193         self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2194
2195     def _real_extract(self, url):
2196         mobj = re.match(self._VALID_URL, url)
2197         if mobj is None:
2198             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2199             return
2200
2201         if '?' in url:
2202             cchar = '&'
2203         else:
2204             cchar = '?'
2205         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2206         request = compat_urllib_request.Request(json_url)
2207         self.report_extraction(mobj.group(1))
2208         info = None
2209         try:
2210             urlh = compat_urllib_request.urlopen(request)
2211             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2212                 basename = url.split('/')[-1]
2213                 title,ext = os.path.splitext(basename)
2214                 title = title.decode('UTF-8')
2215                 ext = ext.replace('.', '')
2216                 self.report_direct_download(title)
2217                 info = {
2218                     'id': title,
2219                     'url': url,
2220                     'uploader': None,
2221                     'upload_date': None,
2222                     'title': title,
2223                     'ext': ext,
2224                     'urlhandle': urlh
2225                 }
2226         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2227             self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2228             return
2229         if info is None: # Regular URL
2230             try:
2231                 json_code_bytes = urlh.read()
2232                 json_code = json_code_bytes.decode('utf-8')
2233             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2234                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2235                 return
2236
2237             try:
2238                 json_data = json.loads(json_code)
2239                 if 'Post' in json_data:
2240                     data = json_data['Post']
2241                 else:
2242                     data = json_data
2243
2244                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2245                 video_url = data['media']['url']
2246                 umobj = re.match(self._URL_EXT, video_url)
2247                 if umobj is None:
2248                     raise ValueError('Can not determine filename extension')
2249                 ext = umobj.group(1)
2250
2251                 info = {
2252                     'id': data['item_id'],
2253                     'url': video_url,
2254                     'uploader': data['display_name'],
2255                     'upload_date': upload_date,
2256                     'title': data['title'],
2257                     'ext': ext,
2258                     'format': data['media']['mimeType'],
2259                     'thumbnail': data['thumbnailUrl'],
2260                     'description': data['description'],
2261                     'player_url': data['embedUrl']
2262                 }
2263             except (ValueError,KeyError) as err:
2264                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2265                 return
2266
2267         std_headers['User-Agent'] = 'iTunes/10.6.1'
2268         return [info]
2269
2270
2271 class MyVideoIE(InfoExtractor):
2272     """Information Extractor for myvideo.de."""
2273
2274     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2275     IE_NAME = u'myvideo'
2276
2277     def __init__(self, downloader=None):
2278         InfoExtractor.__init__(self, downloader)
2279
2280     def report_extraction(self, video_id):
2281         """Report information extraction."""
2282         self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2283
2284     def _real_extract(self,url):
2285         mobj = re.match(self._VALID_URL, url)
2286         if mobj is None:
2287             self._download.trouble(u'ERROR: invalid URL: %s' % url)
2288             return
2289
2290         video_id = mobj.group(1)
2291
2292         # Get video webpage
2293         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2294         webpage = self._download_webpage(webpage_url, video_id)
2295
2296         self.report_extraction(video_id)
2297         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2298                  webpage)
2299         if mobj is None:
2300             self._downloader.trouble(u'ERROR: unable to extract media URL')
2301             return
2302         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2303
2304         mobj = re.search('<title>([^<]+)</title>', webpage)
2305         if mobj is None:
2306             self._downloader.trouble(u'ERROR: unable to extract title')
2307             return
2308
2309         video_title = mobj.group(1)
2310
2311         return [{
2312             'id':       video_id,
2313             'url':      video_url,
2314             'uploader': None,
2315             'upload_date':  None,
2316             'title':    video_title,
2317             'ext':      u'flv',
2318         }]
2319
2320 class ComedyCentralIE(InfoExtractor):
2321     """Information extractor for The Daily Show and Colbert Report """
2322
2323     # urls can be abbreviations like :thedailyshow or :colbert
2324     # urls for episodes like:
2325     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2326     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2327     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2328     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2329                       |(https?://)?(www\.)?
2330                           (?P<showname>thedailyshow|colbertnation)\.com/
2331                          (full-episodes/(?P<episode>.*)|
2332                           (?P<clip>
2333                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2334                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2335                      $"""
2336     IE_NAME = u'comedycentral'
2337
2338     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2339
2340     _video_extensions = {
2341         '3500': 'mp4',
2342         '2200': 'mp4',
2343         '1700': 'mp4',
2344         '1200': 'mp4',
2345         '750': 'mp4',
2346         '400': 'mp4',
2347     }
2348     _video_dimensions = {
2349         '3500': '1280x720',
2350         '2200': '960x540',
2351         '1700': '768x432',
2352         '1200': '640x360',
2353         '750': '512x288',
2354         '400': '384x216',
2355     }
2356
2357     def suitable(self, url):
2358         """Receives a URL and returns True if suitable for this IE."""
2359         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2360
2361     def report_extraction(self, episode_id):
2362         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2363
2364     def report_config_download(self, episode_id):
2365         self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2366
2367     def report_index_download(self, episode_id):
2368         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2369
2370     def report_player_url(self, episode_id):
2371         self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2372
2373
2374     def _print_formats(self, formats):
2375         print('Available formats:')
2376         for x in formats:
2377             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2378
2379
2380     def _real_extract(self, url):
2381         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2382         if mobj is None:
2383             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2384             return
2385
2386         if mobj.group('shortname'):
2387             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2388                 url = u'http://www.thedailyshow.com/full-episodes/'
2389             else:
2390                 url = u'http://www.colbertnation.com/full-episodes/'
2391             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2392             assert mobj is not None
2393
2394         if mobj.group('clip'):
2395             if mobj.group('showname') == 'thedailyshow':
2396                 epTitle = mobj.group('tdstitle')
2397             else:
2398                 epTitle = mobj.group('cntitle')
2399             dlNewest = False
2400         else:
2401             dlNewest = not mobj.group('episode')
2402             if dlNewest:
2403                 epTitle = mobj.group('showname')
2404             else:
2405                 epTitle = mobj.group('episode')
2406
2407         req = compat_urllib_request.Request(url)
2408         self.report_extraction(epTitle)
2409         try:
2410             htmlHandle = compat_urllib_request.urlopen(req)
2411             html = htmlHandle.read()
2412         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2413             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2414             return
2415         if dlNewest:
2416             url = htmlHandle.geturl()
2417             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2418             if mobj is None:
2419                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2420                 return
2421             if mobj.group('episode') == '':
2422                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2423                 return
2424             epTitle = mobj.group('episode')
2425
2426         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
2427
2428         if len(mMovieParams) == 0:
2429             # The Colbert Report embeds the information in a without
2430             # a URL prefix; so extract the alternate reference
2431             # and then add the URL prefix manually.
2432
2433             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
2434             if len(altMovieParams) == 0:
2435                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2436                 return
2437             else:
2438                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2439
2440         playerUrl_raw = mMovieParams[0][0]
2441         self.report_player_url(epTitle)
2442         try:
2443             urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
2444             playerUrl = urlHandle.geturl()
2445         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2446             self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
2447             return
2448
2449         uri = mMovieParams[0][1]
2450         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2451         self.report_index_download(epTitle)
2452         try:
2453             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2454         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2455             self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2456             return
2457
2458         results = []
2459
2460         idoc = xml.etree.ElementTree.fromstring(indexXml)
2461         itemEls = idoc.findall('.//item')
2462         for itemEl in itemEls:
2463             mediaId = itemEl.findall('./guid')[0].text
2464             shortMediaId = mediaId.split(':')[-1]
2465             showId = mediaId.split(':')[-2].replace('.com', '')
2466             officialTitle = itemEl.findall('./title')[0].text
2467             officialDate = itemEl.findall('./pubDate')[0].text
2468
2469             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2470                         compat_urllib_parse.urlencode({'uri': mediaId}))
2471             configReq = compat_urllib_request.Request(configUrl)
2472             self.report_config_download(epTitle)
2473             try:
2474                 configXml = compat_urllib_request.urlopen(configReq).read()
2475             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2476                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2477                 return
2478
2479             cdoc = xml.etree.ElementTree.fromstring(configXml)
2480             turls = []
2481             for rendition in cdoc.findall('.//rendition'):
2482                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2483                 turls.append(finfo)
2484
2485             if len(turls) == 0:
2486                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2487                 continue
2488
2489             if self._downloader.params.get('listformats', None):
2490                 self._print_formats([i[0] for i in turls])
2491                 return
2492
2493             # For now, just pick the highest bitrate
2494             format,video_url = turls[-1]
2495
2496             # Get the format arg from the arg stream
2497             req_format = self._downloader.params.get('format', None)
2498
2499             # Select format if we can find one
2500             for f,v in turls:
2501                 if f == req_format:
2502                     format, video_url = f, v
2503                     break
2504
2505             # Patch to download from alternative CDN, which does not
2506             # break on current RTMPDump builds
2507             broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2508             better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2509
2510             if video_url.startswith(broken_cdn):
2511                 video_url = video_url.replace(broken_cdn, better_cdn)
2512
2513             effTitle = showId + u'-' + epTitle
2514             info = {
2515                 'id': shortMediaId,
2516                 'url': video_url,
2517                 'uploader': showId,
2518                 'upload_date': officialDate,
2519                 'title': effTitle,
2520                 'ext': 'mp4',
2521                 'format': format,
2522                 'thumbnail': None,
2523                 'description': officialTitle,
2524                 'player_url': None #playerUrl
2525             }
2526
2527             results.append(info)
2528
2529         return results
2530
2531
2532 class EscapistIE(InfoExtractor):
2533     """Information extractor for The Escapist """
2534
2535     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2536     IE_NAME = u'escapist'
2537
2538     def report_extraction(self, showName):
2539         self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2540
2541     def report_config_download(self, showName):
2542         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2543
2544     def _real_extract(self, url):
2545         mobj = re.match(self._VALID_URL, url)
2546         if mobj is None:
2547             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2548             return
2549         showName = mobj.group('showname')
2550         videoId = mobj.group('episode')
2551
2552         self.report_extraction(showName)
2553         try:
2554             webPage = compat_urllib_request.urlopen(url)
2555             webPageBytes = webPage.read()
2556             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2557             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2558         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2559             self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2560             return
2561
2562         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2563         description = unescapeHTML(descMatch.group(1))
2564         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2565         imgUrl = unescapeHTML(imgMatch.group(1))
2566         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2567         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2568         configUrlMatch = re.search('config=(.*)$', playerUrl)
2569         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2570
2571         self.report_config_download(showName)
2572         try:
2573             configJSON = compat_urllib_request.urlopen(configUrl)
2574             m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2575             configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2576         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2577             self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2578             return
2579
2580         # Technically, it's JavaScript, not JSON
2581         configJSON = configJSON.replace("'", '"')
2582
2583         try:
2584             config = json.loads(configJSON)
2585         except (ValueError,) as err:
2586             self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2587             return
2588
2589         playlist = config['playlist']
2590         videoUrl = playlist[1]['url']
2591
2592         info = {
2593             'id': videoId,
2594             'url': videoUrl,
2595             'uploader': showName,
2596             'upload_date': None,
2597             'title': showName,
2598             'ext': 'flv',
2599             'thumbnail': imgUrl,
2600             'description': description,
2601             'player_url': playerUrl,
2602         }
2603
2604         return [info]
2605
2606
2607 class CollegeHumorIE(InfoExtractor):
2608     """Information extractor for collegehumor.com"""
2609
2610     _WORKING = False
2611     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2612     IE_NAME = u'collegehumor'
2613
2614     def report_manifest(self, video_id):
2615         """Report information extraction."""
2616         self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2617
2618     def report_extraction(self, video_id):
2619         """Report information extraction."""
2620         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2621
2622     def _real_extract(self, url):
2623         mobj = re.match(self._VALID_URL, url)
2624         if mobj is None:
2625             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2626             return
2627         video_id = mobj.group('videoid')
2628
2629         info = {
2630             'id': video_id,
2631             'uploader': None,
2632             'upload_date': None,
2633         }
2634
2635         self.report_extraction(video_id)
2636         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2637         try:
2638             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2639         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2640             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2641             return
2642
2643         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2644         try:
2645             videoNode = mdoc.findall('./video')[0]
2646             info['description'] = videoNode.findall('./description')[0].text
2647             info['title'] = videoNode.findall('./caption')[0].text
2648             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2649             manifest_url = videoNode.findall('./file')[0].text
2650         except IndexError:
2651             self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2652             return
2653
2654         manifest_url += '?hdcore=2.10.3'
2655         self.report_manifest(video_id)
2656         try:
2657             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2658         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2659             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2660             return
2661
2662         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2663         try:
2664             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2665             node_id = media_node.attrib['url']
2666             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2667         except IndexError as err:
2668             self._downloader.trouble(u'\nERROR: Invalid manifest file')
2669             return
2670
2671         url_pr = compat_urllib_parse_urlparse(manifest_url)
2672         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2673
2674         info['url'] = url
2675         info['ext'] = 'f4f'
2676         return [info]
2677
2678
2679 class XVideosIE(InfoExtractor):
2680     """Information extractor for xvideos.com"""
2681
2682     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2683     IE_NAME = u'xvideos'
2684
2685     def report_extraction(self, video_id):
2686         """Report information extraction."""
2687         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2688
2689     def _real_extract(self, url):
2690         mobj = re.match(self._VALID_URL, url)
2691         if mobj is None:
2692             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2693             return
2694         video_id = mobj.group(1)
2695
2696         webpage = self._download_webpage(url, video_id)
2697
2698         self.report_extraction(video_id)
2699
2700
2701         # Extract video URL
2702         mobj = re.search(r'flv_url=(.+?)&', webpage)
2703         if mobj is None:
2704             self._downloader.trouble(u'ERROR: unable to extract video url')
2705             return
2706         video_url = compat_urllib_parse.unquote(mobj.group(1))
2707
2708
2709         # Extract title
2710         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2711         if mobj is None:
2712             self._downloader.trouble(u'ERROR: unable to extract video title')
2713             return
2714         video_title = mobj.group(1)
2715
2716
2717         # Extract video thumbnail
2718         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2719         if mobj is None:
2720             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2721             return
2722         video_thumbnail = mobj.group(0)
2723
2724         info = {
2725             'id': video_id,
2726             'url': video_url,
2727             'uploader': None,
2728             'upload_date': None,
2729             'title': video_title,
2730             'ext': 'flv',
2731             'thumbnail': video_thumbnail,
2732             'description': None,
2733         }
2734
2735         return [info]
2736
2737
2738 class SoundcloudIE(InfoExtractor):
2739     """Information extractor for soundcloud.com
2740        To access the media, the uid of the song and a stream token
2741        must be extracted from the page source and the script must make
2742        a request to media.soundcloud.com/crossdomain.xml. Then
2743        the media can be grabbed by requesting from an url composed
2744        of the stream token and uid
2745      """
2746
2747     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2748     IE_NAME = u'soundcloud'
2749
2750     def __init__(self, downloader=None):
2751         InfoExtractor.__init__(self, downloader)
2752
2753     def report_resolve(self, video_id):
2754         """Report information extraction."""
2755         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2756
2757     def report_extraction(self, video_id):
2758         """Report information extraction."""
2759         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2760
2761     def _real_extract(self, url):
2762         mobj = re.match(self._VALID_URL, url)
2763         if mobj is None:
2764             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2765             return
2766
2767         # extract uploader (which is in the url)
2768         uploader = mobj.group(1)
2769         # extract simple title (uploader + slug of song title)
2770         slug_title =  mobj.group(2)
2771         simple_title = uploader + u'-' + slug_title
2772
2773         self.report_resolve('%s/%s' % (uploader, slug_title))
2774
2775         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2776         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2777         request = compat_urllib_request.Request(resolv_url)
2778         try:
2779             info_json_bytes = compat_urllib_request.urlopen(request).read()
2780             info_json = info_json_bytes.decode('utf-8')
2781         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2782             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2783             return
2784
2785         info = json.loads(info_json)
2786         video_id = info['id']
2787         self.report_extraction('%s/%s' % (uploader, slug_title))
2788
2789         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2790         request = compat_urllib_request.Request(streams_url)
2791         try:
2792             stream_json_bytes = compat_urllib_request.urlopen(request).read()
2793             stream_json = stream_json_bytes.decode('utf-8')
2794         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2795             self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2796             return
2797
2798         streams = json.loads(stream_json)
2799         mediaURL = streams['http_mp3_128_url']
2800
2801         return [{
2802             'id':       info['id'],
2803             'url':      mediaURL,
2804             'uploader': info['user']['username'],
2805             'upload_date':  info['created_at'],
2806             'title':    info['title'],
2807             'ext':      u'mp3',
2808             'description': info['description'],
2809         }]
2810
2811
2812 class InfoQIE(InfoExtractor):
2813     """Information extractor for infoq.com"""
2814     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2815
2816     def report_extraction(self, video_id):
2817         """Report information extraction."""
2818         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2819
2820     def _real_extract(self, url):
2821         mobj = re.match(self._VALID_URL, url)
2822         if mobj is None:
2823             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2824             return
2825
2826         webpage = self._download_webpage(url, video_id=url)
2827         self.report_extraction(url)
2828
2829         # Extract video URL
2830         mobj = re.search(r"jsclassref='([^']*)'", webpage)
2831         if mobj is None:
2832             self._downloader.trouble(u'ERROR: unable to extract video url')
2833             return
2834         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2835         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2836
2837         # Extract title
2838         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2839         if mobj is None:
2840             self._downloader.trouble(u'ERROR: unable to extract video title')
2841             return
2842         video_title = mobj.group(1)
2843
2844         # Extract description
2845         video_description = u'No description available.'
2846         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2847         if mobj is not None:
2848             video_description = mobj.group(1)
2849
2850         video_filename = video_url.split('/')[-1]
2851         video_id, extension = video_filename.split('.')
2852
2853         info = {
2854             'id': video_id,
2855             'url': video_url,
2856             'uploader': None,
2857             'upload_date': None,
2858             'title': video_title,
2859             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2860             'thumbnail': None,
2861             'description': video_description,
2862         }
2863
2864         return [info]
2865
2866 class MixcloudIE(InfoExtractor):
2867     """Information extractor for www.mixcloud.com"""
2868
2869     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2870     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2871     IE_NAME = u'mixcloud'
2872
2873     def __init__(self, downloader=None):
2874         InfoExtractor.__init__(self, downloader)
2875
2876     def report_download_json(self, file_id):
2877         """Report JSON download."""
2878         self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2879
2880     def report_extraction(self, file_id):
2881         """Report information extraction."""
2882         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2883
2884     def get_urls(self, jsonData, fmt, bitrate='best'):
2885         """Get urls from 'audio_formats' section in json"""
2886         file_url = None
2887         try:
2888             bitrate_list = jsonData[fmt]
2889             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2890                 bitrate = max(bitrate_list) # select highest
2891
2892             url_list = jsonData[fmt][bitrate]
2893         except TypeError: # we have no bitrate info.
2894             url_list = jsonData[fmt]
2895         return url_list
2896
2897     def check_urls(self, url_list):
2898         """Returns 1st active url from list"""
2899         for url in url_list:
2900             try:
2901                 compat_urllib_request.urlopen(url)
2902                 return url
2903             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2904                 url = None
2905
2906         return None
2907
2908     def _print_formats(self, formats):
2909         print('Available formats:')
2910         for fmt in formats.keys():
2911             for b in formats[fmt]:
2912                 try:
2913                     ext = formats[fmt][b][0]
2914                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2915                 except TypeError: # we have no bitrate info
2916                     ext = formats[fmt][0]
2917                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2918                     break
2919
2920     def _real_extract(self, url):
2921         mobj = re.match(self._VALID_URL, url)
2922         if mobj is None:
2923             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2924             return
2925         # extract uploader & filename from url
2926         uploader = mobj.group(1).decode('utf-8')
2927         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2928
2929         # construct API request
2930         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2931         # retrieve .json file with links to files
2932         request = compat_urllib_request.Request(file_url)
2933         try:
2934             self.report_download_json(file_url)
2935             jsonData = compat_urllib_request.urlopen(request).read()
2936         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2937             self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2938             return
2939
2940         # parse JSON
2941         json_data = json.loads(jsonData)
2942         player_url = json_data['player_swf_url']
2943         formats = dict(json_data['audio_formats'])
2944
2945         req_format = self._downloader.params.get('format', None)
2946         bitrate = None
2947
2948         if self._downloader.params.get('listformats', None):
2949             self._print_formats(formats)
2950             return
2951
2952         if req_format is None or req_format == 'best':
2953             for format_param in formats.keys():
2954                 url_list = self.get_urls(formats, format_param)
2955                 # check urls
2956                 file_url = self.check_urls(url_list)
2957                 if file_url is not None:
2958                     break # got it!
2959         else:
2960             if req_format not in formats:
2961                 self._downloader.trouble(u'ERROR: format is not available')
2962                 return
2963
2964             url_list = self.get_urls(formats, req_format)
2965             file_url = self.check_urls(url_list)
2966             format_param = req_format
2967
2968         return [{
2969             'id': file_id.decode('utf-8'),
2970             'url': file_url.decode('utf-8'),
2971             'uploader': uploader.decode('utf-8'),
2972             'upload_date': None,
2973             'title': json_data['name'],
2974             'ext': file_url.split('.')[-1].decode('utf-8'),
2975             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2976             'thumbnail': json_data['thumbnail_url'],
2977             'description': json_data['description'],
2978             'player_url': player_url.decode('utf-8'),
2979         }]
2980
2981 class StanfordOpenClassroomIE(InfoExtractor):
2982     """Information extractor for Stanford's Open ClassRoom"""
2983
2984     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2985     IE_NAME = u'stanfordoc'
2986
2987     def report_download_webpage(self, objid):
2988         """Report information extraction."""
2989         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2990
2991     def report_extraction(self, video_id):
2992         """Report information extraction."""
2993         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2994
2995     def _real_extract(self, url):
2996         mobj = re.match(self._VALID_URL, url)
2997         if mobj is None:
2998             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2999             return
3000
3001         if mobj.group('course') and mobj.group('video'): # A specific video
3002             course = mobj.group('course')
3003             video = mobj.group('video')
3004             info = {
3005                 'id': course + '_' + video,
3006                 'uploader': None,
3007                 'upload_date': None,
3008             }
3009
3010             self.report_extraction(info['id'])
3011             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3012             xmlUrl = baseUrl + video + '.xml'
3013             try:
3014                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3015             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3016                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3017                 return
3018             mdoc = xml.etree.ElementTree.fromstring(metaXml)
3019             try:
3020                 info['title'] = mdoc.findall('./title')[0].text
3021                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3022             except IndexError:
3023                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3024                 return
3025             info['ext'] = info['url'].rpartition('.')[2]
3026             return [info]
3027         elif mobj.group('course'): # A course page
3028             course = mobj.group('course')
3029             info = {
3030                 'id': course,
3031                 'type': 'playlist',
3032                 'uploader': None,
3033                 'upload_date': None,
3034             }
3035
3036             self.report_download_webpage(info['id'])
3037             try:
3038                 coursepage = compat_urllib_request.urlopen(url).read()
3039             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3040                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3041                 return
3042
3043             m = re.search('<h1>([^<]+)</h1>', coursepage)
3044             if m:
3045                 info['title'] = unescapeHTML(m.group(1))
3046             else:
3047                 info['title'] = info['id']
3048
3049             m = re.search('<description>([^<]+)</description>', coursepage)
3050             if m:
3051                 info['description'] = unescapeHTML(m.group(1))
3052
3053             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3054             info['list'] = [
3055                 {
3056                     'type': 'reference',
3057                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3058                 }
3059                     for vpage in links]
3060             results = []
3061             for entry in info['list']:
3062                 assert entry['type'] == 'reference'
3063                 results += self.extract(entry['url'])
3064             return results
3065
3066         else: # Root page
3067             info = {
3068                 'id': 'Stanford OpenClassroom',
3069                 'type': 'playlist',
3070                 'uploader': None,
3071                 'upload_date': None,
3072             }
3073
3074             self.report_download_webpage(info['id'])
3075             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3076             try:
3077                 rootpage = compat_urllib_request.urlopen(rootURL).read()
3078             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3079                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3080                 return
3081
3082             info['title'] = info['id']
3083
3084             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3085             info['list'] = [
3086                 {
3087                     'type': 'reference',
3088                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3089                 }
3090                     for cpage in links]
3091
3092             results = []
3093             for entry in info['list']:
3094                 assert entry['type'] == 'reference'
3095                 results += self.extract(entry['url'])
3096             return results
3097
3098 class MTVIE(InfoExtractor):
3099     """Information extractor for MTV.com"""
3100
3101     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3102     IE_NAME = u'mtv'
3103
3104     def report_extraction(self, video_id):
3105         """Report information extraction."""
3106         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3107
3108     def _real_extract(self, url):
3109         mobj = re.match(self._VALID_URL, url)
3110         if mobj is None:
3111             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3112             return
3113         if not mobj.group('proto'):
3114             url = 'http://' + url
3115         video_id = mobj.group('videoid')
3116
3117         webpage = self._download_webpage(url, video_id)
3118
3119         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3120         if mobj is None:
3121             self._downloader.trouble(u'ERROR: unable to extract song name')
3122             return
3123         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3124         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3125         if mobj is None:
3126             self._downloader.trouble(u'ERROR: unable to extract performer')
3127             return
3128         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3129         video_title = performer + ' - ' + song_name
3130
3131         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3132         if mobj is None:
3133             self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3134             return
3135         mtvn_uri = mobj.group(1)
3136
3137         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3138         if mobj is None:
3139             self._downloader.trouble(u'ERROR: unable to extract content id')
3140             return
3141         content_id = mobj.group(1)
3142
3143         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3144         self.report_extraction(video_id)
3145         request = compat_urllib_request.Request(videogen_url)
3146         try:
3147             metadataXml = compat_urllib_request.urlopen(request).read()
3148         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3149             self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3150             return
3151
3152         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3153         renditions = mdoc.findall('.//rendition')
3154
3155         # For now, always pick the highest quality.
3156         rendition = renditions[-1]
3157
3158         try:
3159             _,_,ext = rendition.attrib['type'].partition('/')
3160             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3161             video_url = rendition.find('./src').text
3162         except KeyError:
3163             self._downloader.trouble('Invalid rendition field.')
3164             return
3165
3166         info = {
3167             'id': video_id,
3168             'url': video_url,
3169             'uploader': performer,
3170             'upload_date': None,
3171             'title': video_title,
3172             'ext': ext,
3173             'format': format,
3174         }
3175
3176         return [info]
3177
3178
3179 class YoukuIE(InfoExtractor):
3180     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3181
3182     def report_download_webpage(self, file_id):
3183         """Report webpage download."""
3184         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3185
3186     def report_extraction(self, file_id):
3187         """Report information extraction."""
3188         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3189
3190     def _gen_sid(self):
3191         nowTime = int(time.time() * 1000)
3192         random1 = random.randint(1000,1998)
3193         random2 = random.randint(1000,9999)
3194
3195         return "%d%d%d" %(nowTime,random1,random2)
3196
3197     def _get_file_ID_mix_string(self, seed):
3198         mixed = []
3199         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3200         seed = float(seed)
3201         for i in range(len(source)):
3202             seed  =  (seed * 211 + 30031 ) % 65536
3203             index  =  math.floor(seed / 65536 * len(source) )
3204             mixed.append(source[int(index)])
3205             source.remove(source[int(index)])
3206         #return ''.join(mixed)
3207         return mixed
3208
3209     def _get_file_id(self, fileId, seed):
3210         mixed = self._get_file_ID_mix_string(seed)
3211         ids = fileId.split('*')
3212         realId = []
3213         for ch in ids:
3214             if ch:
3215                 realId.append(mixed[int(ch)])
3216         return ''.join(realId)
3217
3218     def _real_extract(self, url):
3219         mobj = re.match(self._VALID_URL, url)
3220         if mobj is None:
3221             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3222             return
3223         video_id = mobj.group('ID')
3224
3225         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3226
3227         request = compat_urllib_request.Request(info_url, None, std_headers)
3228         try:
3229             self.report_download_webpage(video_id)
3230             jsondata = compat_urllib_request.urlopen(request).read()
3231         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3232             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3233             return
3234
3235         self.report_extraction(video_id)
3236         try:
3237             jsonstr = jsondata.decode('utf-8')
3238             config = json.loads(jsonstr)
3239
3240             video_title =  config['data'][0]['title']
3241             seed = config['data'][0]['seed']
3242
3243             format = self._downloader.params.get('format', None)
3244             supported_format = list(config['data'][0]['streamfileids'].keys())
3245
3246             if format is None or format == 'best':
3247                 if 'hd2' in supported_format:
3248                     format = 'hd2'
3249                 else:
3250                     format = 'flv'
3251                 ext = u'flv'
3252             elif format == 'worst':
3253                 format = 'mp4'
3254                 ext = u'mp4'
3255             else:
3256                 format = 'flv'
3257                 ext = u'flv'
3258
3259
3260             fileid = config['data'][0]['streamfileids'][format]
3261             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3262         except (UnicodeDecodeError, ValueError, KeyError):
3263             self._downloader.trouble(u'ERROR: unable to extract info section')
3264             return
3265
3266         files_info=[]
3267         sid = self._gen_sid()
3268         fileid = self._get_file_id(fileid, seed)
3269
3270         #column 8,9 of fileid represent the segment number
3271         #fileid[7:9] should be changed
3272         for index, key in enumerate(keys):
3273
3274             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3275             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3276
3277             info = {
3278                 'id': '%s_part%02d' % (video_id, index),
3279                 'url': download_url,
3280                 'uploader': None,
3281                 'upload_date': None,
3282                 'title': video_title,
3283                 'ext': ext,
3284             }
3285             files_info.append(info)
3286
3287         return files_info
3288
3289
3290 class XNXXIE(InfoExtractor):
3291     """Information extractor for xnxx.com"""
3292
3293     _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3294     IE_NAME = u'xnxx'
3295     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3296     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3297     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3298
3299     def report_webpage(self, video_id):
3300         """Report information extraction"""
3301         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3302
3303     def report_extraction(self, video_id):
3304         """Report information extraction"""
3305         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3306
3307     def _real_extract(self, url):
3308         mobj = re.match(self._VALID_URL, url)
3309         if mobj is None:
3310             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3311             return
3312         video_id = mobj.group(1)
3313
3314         self.report_webpage(video_id)
3315
3316         # Get webpage content
3317         try:
3318             webpage_bytes = compat_urllib_request.urlopen(url).read()
3319             webpage = webpage_bytes.decode('utf-8')
3320         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3321             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3322             return
3323
3324         result = re.search(self.VIDEO_URL_RE, webpage)
3325         if result is None:
3326             self._downloader.trouble(u'ERROR: unable to extract video url')
3327             return
3328         video_url = compat_urllib_parse.unquote(result.group(1))
3329
3330         result = re.search(self.VIDEO_TITLE_RE, webpage)
3331         if result is None:
3332             self._downloader.trouble(u'ERROR: unable to extract video title')
3333             return
3334         video_title = result.group(1)
3335
3336         result = re.search(self.VIDEO_THUMB_RE, webpage)
3337         if result is None:
3338             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3339             return
3340         video_thumbnail = result.group(1)
3341
3342         return [{
3343             'id': video_id,
3344             'url': video_url,
3345             'uploader': None,
3346             'upload_date': None,
3347             'title': video_title,
3348             'ext': 'flv',
3349             'thumbnail': video_thumbnail,
3350             'description': None,
3351         }]
3352
3353
3354 class GooglePlusIE(InfoExtractor):
3355     """Information extractor for plus.google.com."""
3356
3357     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3358     IE_NAME = u'plus.google'
3359
3360     def __init__(self, downloader=None):
3361         InfoExtractor.__init__(self, downloader)
3362
3363     def report_extract_entry(self, url):
3364         """Report downloading extry"""
3365         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3366
3367     def report_date(self, upload_date):
3368         """Report downloading extry"""
3369         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3370
3371     def report_uploader(self, uploader):
3372         """Report downloading extry"""
3373         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3374
3375     def report_title(self, video_title):
3376         """Report downloading extry"""
3377         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3378
3379     def report_extract_vid_page(self, video_page):
3380         """Report information extraction."""
3381         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3382
3383     def _real_extract(self, url):
3384         # Extract id from URL
3385         mobj = re.match(self._VALID_URL, url)
3386         if mobj is None:
3387             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3388             return
3389
3390         post_url = mobj.group(0)
3391         video_id = mobj.group(1)
3392
3393         video_extension = 'flv'
3394
3395         # Step 1, Retrieve post webpage to extract further information
3396         self.report_extract_entry(post_url)
3397         request = compat_urllib_request.Request(post_url)
3398         try:
3399             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3400         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3401             self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3402             return
3403
3404         # Extract update date
3405         upload_date = None
3406         pattern = 'title="Timestamp">(.*?)</a>'
3407         mobj = re.search(pattern, webpage)
3408         if mobj:
3409             upload_date = mobj.group(1)
3410             # Convert timestring to a format suitable for filename
3411             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3412             upload_date = upload_date.strftime('%Y%m%d')
3413         self.report_date(upload_date)
3414
3415         # Extract uploader
3416         uploader = None
3417         pattern = r'rel\="author".*?>(.*?)</a>'
3418         mobj = re.search(pattern, webpage)
3419         if mobj:
3420             uploader = mobj.group(1)
3421         self.report_uploader(uploader)
3422
3423         # Extract title
3424         # Get the first line for title
3425         video_title = u'NA'
3426         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3427         mobj = re.search(pattern, webpage)
3428         if mobj:
3429             video_title = mobj.group(1)
3430         self.report_title(video_title)
3431
3432         # Step 2, Stimulate clicking the image box to launch video
3433         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3434         mobj = re.search(pattern, webpage)
3435         if mobj is None:
3436             self._downloader.trouble(u'ERROR: unable to extract video page URL')
3437
3438         video_page = mobj.group(1)
3439         request = compat_urllib_request.Request(video_page)
3440         try:
3441             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3442         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3443             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3444             return
3445         self.report_extract_vid_page(video_page)
3446
3447
3448         # Extract video links on video page
3449         """Extract video links of all sizes"""
3450         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3451         mobj = re.findall(pattern, webpage)
3452         if len(mobj) == 0:
3453             self._downloader.trouble(u'ERROR: unable to extract video links')
3454
3455         # Sort in resolution
3456         links = sorted(mobj)
3457
3458         # Choose the lowest of the sort, i.e. highest resolution
3459         video_url = links[-1]
3460         # Only get the url. The resolution part in the tuple has no use anymore
3461         video_url = video_url[-1]
3462         # Treat escaped \u0026 style hex
3463         try:
3464             video_url = video_url.decode("unicode_escape")
3465         except AttributeError: # Python 3
3466             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3467
3468
3469         return [{
3470             'id':       video_id,
3471             'url':      video_url,
3472             'uploader': uploader,
3473             'upload_date':  upload_date,
3474             'title':    video_title,
3475             'ext':      video_extension,
3476         }]
3477
3478 class NBAIE(InfoExtractor):
3479     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3480     IE_NAME = u'nba'
3481
3482     def _real_extract(self, url):
3483         mobj = re.match(self._VALID_URL, url)
3484         if mobj is None:
3485             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3486             return
3487
3488         video_id = mobj.group(1)
3489         if video_id.endswith('/index.html'):
3490             video_id = video_id[:-len('/index.html')]
3491
3492         webpage = self._download_webpage(url, video_id)
3493
3494         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3495         def _findProp(rexp, default=None):
3496             m = re.search(rexp, webpage)
3497             if m:
3498                 return unescapeHTML(m.group(1))
3499             else:
3500                 return default
3501
3502         shortened_video_id = video_id.rpartition('/')[2]
3503         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3504         info = {
3505             'id': shortened_video_id,
3506             'url': video_url,
3507             'ext': 'mp4',
3508             'title': title,
3509             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3510             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3511         }
3512         return [info]
3513
3514 class JustinTVIE(InfoExtractor):
3515     """Information extractor for justin.tv and twitch.tv"""
3516     # TODO: One broadcast may be split into multiple videos. The key
3517     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3518     # starts at 1 and increases. Can we treat all parts as one video?
3519
3520     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3521         ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3522     _JUSTIN_PAGE_LIMIT = 100
3523     IE_NAME = u'justin.tv'
3524
3525     def report_extraction(self, file_id):
3526         """Report information extraction."""
3527         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3528
3529     def report_download_page(self, channel, offset):
3530         """Report attempt to download a single page of videos."""
3531         self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3532                 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3533
3534     # Return count of items, list of *valid* items
3535     def _parse_page(self, url):
3536         try:
3537             urlh = compat_urllib_request.urlopen(url)
3538             webpage_bytes = urlh.read()
3539             webpage = webpage_bytes.decode('utf-8', 'ignore')
3540         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3541             self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3542             return
3543
3544         response = json.loads(webpage)
3545         info = []
3546         for clip in response:
3547             video_url = clip['video_file_url']
3548             if video_url:
3549                 video_extension = os.path.splitext(video_url)[1][1:]
3550                 video_date = re.sub('-', '', clip['created_on'][:10])
3551                 info.append({
3552                     'id': clip['id'],
3553                     'url': video_url,
3554                     'title': clip['title'],
3555                     'uploader': clip.get('user_id', clip.get('channel_id')),
3556                     'upload_date': video_date,
3557                     'ext': video_extension,
3558                 })
3559         return (len(response), info)
3560
3561     def _real_extract(self, url):
3562         mobj = re.match(self._VALID_URL, url)
3563         if mobj is None:
3564             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3565             return
3566
3567         api = 'http://api.justin.tv'
3568         video_id = mobj.group(mobj.lastindex)
3569         paged = False
3570         if mobj.lastindex == 1:
3571             paged = True
3572             api += '/channel/archives/%s.json'
3573         else:
3574             api += '/clip/show/%s.json'
3575         api = api % (video_id,)
3576
3577         self.report_extraction(video_id)
3578
3579         info = []
3580         offset = 0
3581         limit = self._JUSTIN_PAGE_LIMIT
3582         while True:
3583             if paged:
3584                 self.report_download_page(video_id, offset)
3585             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3586             page_count, page_info = self._parse_page(page_url)
3587             info.extend(page_info)
3588             if not paged or page_count != limit:
3589                 break
3590             offset += limit
3591         return info
3592
3593 class FunnyOrDieIE(InfoExtractor):
3594     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3595
3596     def _real_extract(self, url):
3597         mobj = re.match(self._VALID_URL, url)
3598         if mobj is None:
3599             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3600             return
3601
3602         video_id = mobj.group('id')
3603         webpage = self._download_webpage(url, video_id)
3604
3605         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3606         if not m:
3607             self._downloader.trouble(u'ERROR: unable to find video information')
3608         video_url = unescapeHTML(m.group('url'))
3609
3610         m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3611         if not m:
3612             self._downloader.trouble(u'Cannot find video title')
3613         title = unescapeHTML(m.group('title'))
3614
3615         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3616         if m:
3617             desc = unescapeHTML(m.group('desc'))
3618         else:
3619             desc = None
3620
3621         info = {
3622             'id': video_id,
3623             'url': video_url,
3624             'ext': 'mp4',
3625             'title': title,
3626             'description': desc,
3627         }
3628         return [info]
3629
3630 class TweetReelIE(InfoExtractor):
3631     _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3632
3633     def _real_extract(self, url):
3634         mobj = re.match(self._VALID_URL, url)
3635         if mobj is None:
3636             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3637             return
3638
3639         video_id = mobj.group('id')
3640         webpage = self._download_webpage(url, video_id)
3641
3642         m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3643         if not m:
3644             self._downloader.trouble(u'ERROR: Cannot find status ID')
3645         status_id = m.group(1)
3646
3647         m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3648         if not m:
3649             self._downloader.trouble(u'WARNING: Cannot find description')
3650         desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3651
3652         m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3653         if not m:
3654             self._downloader.trouble(u'ERROR: Cannot find uploader')
3655         uploader = unescapeHTML(m.group('uploader'))
3656         uploader_id = unescapeHTML(m.group('uploader_id'))
3657
3658         m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3659         if not m:
3660             self._downloader.trouble(u'ERROR: Cannot find upload date')
3661         upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3662
3663         title = desc
3664         video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3665
3666         info = {
3667             'id': video_id,
3668             'url': video_url,
3669             'ext': 'mov',
3670             'title': title,
3671             'description': desc,
3672             'uploader': uploader,
3673             'uploader_id': uploader_id,
3674             'internal_id': status_id,
3675             'upload_date': upload_date
3676         }
3677         return [info]
3678
3679 class SteamIE(InfoExtractor):
3680     _VALID_URL = r"""http://store.steampowered.com/
3681                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3682                 (?P<gameID>\d+)/?
3683                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3684                 """
3685
3686     def suitable(self, url):
3687         """Receives a URL and returns True if suitable for this IE."""
3688         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3689
3690     def _real_extract(self, url):
3691         m = re.match(self._VALID_URL, url, re.VERBOSE)
3692         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3693         gameID = m.group('gameID')
3694         videourl = 'http://store.steampowered.com/video/%s/' % gameID
3695         webpage = self._download_webpage(videourl, gameID)
3696         mweb = re.finditer(urlRE, webpage)
3697         namesRE = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
3698         titles = list(re.finditer(namesRE, webpage))
3699         videos = []
3700         for vid,vtitle in zip(mweb,titles):
3701             video_id = vid.group('videoID')
3702             title = vtitle.group('videoName')
3703             video_url = vid.group('videoURL')
3704             if not video_url:
3705                 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3706             info = {
3707                 'id':video_id,
3708                 'url':video_url,
3709                 'ext': 'flv',
3710                 'title': title
3711                   }
3712             videos.append(info)
3713         return videos
3714
3715 class UstreamIE(InfoExtractor):
3716     _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
3717     IE_NAME = u'ustream'
3718
3719     def _real_extract(self, url):
3720         m = re.match(self._VALID_URL, url)
3721         video_id = m.group('videoID')
3722         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3723         webpage = self._download_webpage(url, video_id)
3724         m = re.search(r'data-title="(?P<title>.+)"',webpage)
3725         title = m.group('title')
3726         m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3727         uploader = m.group('uploader')
3728         info = {
3729                 'id':video_id,
3730                 'url':video_url,
3731                 'ext': 'flv',
3732                 'title': title,
3733                 'uploader': uploader
3734                   }
3735         return [info]
3736
3737
3738 def gen_extractors():
3739     """ Return a list of an instance of every supported extractor.
3740     The order does matter; the first extractor matched is the one handling the URL.
3741     """
3742     return [
3743         YoutubePlaylistIE(),
3744         YoutubeChannelIE(),
3745         YoutubeUserIE(),
3746         YoutubeSearchIE(),
3747         YoutubeIE(),
3748         MetacafeIE(),
3749         DailymotionIE(),
3750         GoogleSearchIE(),
3751         PhotobucketIE(),
3752         YahooIE(),
3753         YahooSearchIE(),
3754         DepositFilesIE(),
3755         FacebookIE(),
3756         BlipTVUserIE(),
3757         BlipTVIE(),
3758         VimeoIE(),
3759         MyVideoIE(),
3760         ComedyCentralIE(),
3761         EscapistIE(),
3762         CollegeHumorIE(),
3763         XVideosIE(),
3764         SoundcloudIE(),
3765         InfoQIE(),
3766         MixcloudIE(),
3767         StanfordOpenClassroomIE(),
3768         MTVIE(),
3769         YoukuIE(),
3770         XNXXIE(),
3771         GooglePlusIE(),
3772         ArteTvIE(),
3773         NBAIE(),
3774         JustinTVIE(),
3775         FunnyOrDieIE(),
3776         TweetReelIE(),
3777         SteamIE(),
3778         UstreamIE(),
3779         GenericIE()
3780     ]
3781
3782