youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19 import hashlib
  20 import binascii
  21 import urllib
  22
  23 from .utils import *
  24
  25
  26 class InfoExtractor(object):
  27     """Information Extractor class.
  28
  29     Information extractors are the classes that, given a URL, extract
  30     information about the video (or videos) the URL refers to. This
  31     information includes the real video URL, the video title, author and
  32     others. The information is stored in a dictionary which is then
  33     passed to the FileDownloader. The FileDownloader processes this
  34     information possibly downloading the video to the file system, among
  35     other possible outcomes.
  36
  37     The dictionaries must include the following fields:
  38
  39     id:             Video identifier.
  40     url:            Final video URL.
  41     title:          Video title, unescaped.
  42     ext:            Video filename extension.
  43
  44     The following fields are optional:
  45
  46     format:         The video format, defaults to ext (used for --get-format)
  47     thumbnail:      Full URL to a video thumbnail image.
  48     description:    One-line video description.
  49     uploader:       Full name of the video uploader.
  50     upload_date:    Video upload date (YYYYMMDD).
  51     uploader_id:    Nickname or id of the video uploader.
  52     location:       Physical location of the video.
  53     player_url:     SWF Player URL (used for rtmpdump).
  54     subtitles:      The subtitle file contents.
  55     urlhandle:      [internal] The urlHandle to be used to download the file,
  56                     like returned by urllib.request.urlopen
  57
  58     The fields should all be Unicode strings.
  59
  60     Subclasses of this one should re-define the _real_initialize() and
  61     _real_extract() methods and define a _VALID_URL regexp.
  62     Probably, they should also be added to the list of extractors.
  63
  64     _real_extract() must return a *list* of information dictionaries as
  65     described above.
  66
  67     Finally, the _WORKING attribute should be set to False for broken IEs
  68     in order to warn the users and skip the tests.
  69     """
  70
  71     _ready = False
  72     _downloader = None
  73     _WORKING = True
  74
  75     def __init__(self, downloader=None):
  76         """Constructor. Receives an optional downloader."""
  77         self._ready = False
  78         self.set_downloader(downloader)
  79
  80     @classmethod
  81     def suitable(cls, url):
  82         """Receives a URL and returns True if suitable for this IE."""
  83         return re.match(cls._VALID_URL, url) is not None
  84
  85     @classmethod
  86     def working(cls):
  87         """Getter method for _WORKING."""
  88         return cls._WORKING
  89
  90     def initialize(self):
  91         """Initializes an instance (authentication, etc)."""
  92         if not self._ready:
  93             self._real_initialize()
  94             self._ready = True
  95
  96     def extract(self, url):
  97         """Extracts URL information and returns it in list of dicts."""
  98         self.initialize()
  99         return self._real_extract(url)
 100
 101     def set_downloader(self, downloader):
 102         """Sets the downloader for this IE."""
 103         self._downloader = downloader
 104
 105     def _real_initialize(self):
 106         """Real initialization process. Redefine in subclasses."""
 107         pass
 108
 109     def _real_extract(self, url):
 110         """Real extraction process. Redefine in subclasses."""
 111         pass
 112
 113     @property
 114     def IE_NAME(self):
 115         return type(self).__name__[:-2]
 116
 117     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 118         """ Returns the response handle """
 119         if note is None:
 120             self.report_download_webpage(video_id)
 121         elif note is not False:
 122             self.to_screen(u'%s: %s' % (video_id, note))
 123         try:
 124             return compat_urllib_request.urlopen(url_or_request)
 125         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 126             if errnote is None:
 127                 errnote = u'Unable to download webpage'
 128             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 129
 130     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 131         """ Returns a tuple (page content as string, URL handle) """
 132         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 133         content_type = urlh.headers.get('Content-Type', '')
 134         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 135         if m:
 136             encoding = m.group(1)
 137         else:
 138             encoding = 'utf-8'
 139         webpage_bytes = urlh.read()
 140         if self._downloader.params.get('dump_intermediate_pages', False):
 141             try:
 142                 url = url_or_request.get_full_url()
 143             except AttributeError:
 144                 url = url_or_request
 145             self.to_screen(u'Dumping request to ' + url)
 146             dump = base64.b64encode(webpage_bytes).decode('ascii')
 147             self._downloader.to_screen(dump)
 148         content = webpage_bytes.decode(encoding, 'replace')
 149         return (content, urlh)
 150
 151     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 152         """ Returns the data of the page as a string """
 153         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 154
 155     def to_screen(self, msg):
 156         """Print msg to screen, prefixing it with '[ie_name]'"""
 157         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 158
 159     def report_extraction(self, id_or_name):
 160         """Report information extraction."""
 161         self.to_screen(u'%s: Extracting information' % id_or_name)
 162
 163     def report_download_webpage(self, video_id):
 164         """Report webpage download."""
 165         self.to_screen(u'%s: Downloading webpage' % video_id)
 166
 167     def report_age_confirmation(self):
 168         """Report attempt to confirm age."""
 169         self.to_screen(u'Confirming age')
 170
 171     #Methods for following #608
 172     #They set the correct value of the '_type' key
 173     def video_result(self, video_info):
 174         """Returns a video"""
 175         video_info['_type'] = 'video'
 176         return video_info
 177     def url_result(self, url, ie=None):
 178         """Returns a url that points to a page that should be processed"""
 179         #TODO: ie should be the class used for getting the info
 180         video_info = {'_type': 'url',
 181                       'url': url,
 182                       'ie_key': ie}
 183         return video_info
 184     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 185         """Returns a playlist"""
 186         video_info = {'_type': 'playlist',
 187                       'entries': entries}
 188         if playlist_id:
 189             video_info['id'] = playlist_id
 190         if playlist_title:
 191             video_info['title'] = playlist_title
 192         return video_info
 193
 194     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 195         """
 196         Perform a regex search on the given string, using a single or a list of
 197         patterns returning the first matching group.
 198         In case of failure return a default value or raise a WARNING or a
 199         ExtractorError, depending on fatal, specifying the field name.
 200         """
 201         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 202             mobj = re.search(pattern, string, flags)
 203         else:
 204             for p in pattern:
 205                 mobj = re.search(p, string, flags)
 206                 if mobj: break
 207
 208         if sys.stderr.isatty() and os.name != 'nt':
 209             _name = u'\033[0;34m%s\033[0m' % name
 210         else:
 211             _name = name
 212
 213         if mobj:
 214             # return the first matching group
 215             return next(g for g in mobj.groups() if g is not None)
 216         elif default is not None:
 217             return default
 218         elif fatal:
 219             raise ExtractorError(u'Unable to extract %s' % _name)
 220         else:
 221             self._downloader.report_warning(u'unable to extract %s; '
 222                 u'please report this issue on GitHub.' % _name)
 223             return None
 224
 225     def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 226         """
 227         Like _search_regex, but strips HTML tags and unescapes entities.
 228         """
 229         res = self._search_regex(pattern, string, name, default, fatal, flags)
 230         if res:
 231             return clean_html(res).strip()
 232         else:
 233             return res
 234
 235 class SearchInfoExtractor(InfoExtractor):
 236     """
 237     Base class for paged search queries extractors.
 238     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 239     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 240     """
 241
 242     @classmethod
 243     def _make_valid_url(cls):
 244         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 245
 246     @classmethod
 247     def suitable(cls, url):
 248         return re.match(cls._make_valid_url(), url) is not None
 249
 250     def _real_extract(self, query):
 251         mobj = re.match(self._make_valid_url(), query)
 252         if mobj is None:
 253             raise ExtractorError(u'Invalid search query "%s"' % query)
 254
 255         prefix = mobj.group('prefix')
 256         query = mobj.group('query')
 257         if prefix == '':
 258             return self._get_n_results(query, 1)
 259         elif prefix == 'all':
 260             return self._get_n_results(query, self._MAX_RESULTS)
 261         else:
 262             n = int(prefix)
 263             if n <= 0:
 264                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 265             elif n > self._MAX_RESULTS:
 266                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 267                 n = self._MAX_RESULTS
 268             return self._get_n_results(query, n)
 269
 270     def _get_n_results(self, query, n):
 271         """Get a specified number of results for a query"""
 272         raise NotImplementedError("This method must be implemented by sublclasses")
 273
 274
 275 class YoutubeIE(InfoExtractor):
 276     """Information extractor for youtube.com."""
 277
 278     _VALID_URL = r"""^
 279                      (
 280                          (?:https?://)?                                       # http(s):// (optional)
 281                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 282                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 283                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 284                          (?:                                                  # the various things that can precede the ID:
 285                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 286                              |(?:                                             # or the v= param in all its forms
 287                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 288                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 289                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 290                                  v=
 291                              )
 292                          )?                                                   # optional -> youtube.com/xxxx is OK
 293                      )?                                                       # all until now is optional -> you can pass the naked ID
 294                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 295                      (?(1).+)?                                                # if we found the ID, everything can follow
 296                      $"""
 297     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 298     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 299     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 300     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 301     _NETRC_MACHINE = 'youtube'
 302     # Listed in order of quality
 303     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 304     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 305     _video_extensions = {
 306         '13': '3gp',
 307         '17': 'mp4',
 308         '18': 'mp4',
 309         '22': 'mp4',
 310         '37': 'mp4',
 311         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 312         '43': 'webm',
 313         '44': 'webm',
 314         '45': 'webm',
 315         '46': 'webm',
 316     }
 317     _video_dimensions = {
 318         '5': '240x400',
 319         '6': '???',
 320         '13': '???',
 321         '17': '144x176',
 322         '18': '360x640',
 323         '22': '720x1280',
 324         '34': '360x640',
 325         '35': '480x854',
 326         '37': '1080x1920',
 327         '38': '3072x4096',
 328         '43': '360x640',
 329         '44': '480x854',
 330         '45': '720x1280',
 331         '46': '1080x1920',
 332     }
 333     IE_NAME = u'youtube'
 334
 335     @classmethod
 336     def suitable(cls, url):
 337         """Receives a URL and returns True if suitable for this IE."""
 338         if YoutubePlaylistIE.suitable(url): return False
 339         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 340
 341     def report_lang(self):
 342         """Report attempt to set language."""
 343         self.to_screen(u'Setting language')
 344
 345     def report_login(self):
 346         """Report attempt to log in."""
 347         self.to_screen(u'Logging in')
 348
 349     def report_video_webpage_download(self, video_id):
 350         """Report attempt to download video webpage."""
 351         self.to_screen(u'%s: Downloading video webpage' % video_id)
 352
 353     def report_video_info_webpage_download(self, video_id):
 354         """Report attempt to download video info webpage."""
 355         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 356
 357     def report_video_subtitles_download(self, video_id):
 358         """Report attempt to download video info webpage."""
 359         self.to_screen(u'%s: Checking available subtitles' % video_id)
 360
 361     def report_video_subtitles_request(self, video_id, sub_lang, format):
 362         """Report attempt to download video info webpage."""
 363         self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 364
 365     def report_video_subtitles_available(self, video_id, sub_lang_list):
 366         """Report available subtitles."""
 367         sub_lang = ",".join(list(sub_lang_list.keys()))
 368         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
 369
 370     def report_information_extraction(self, video_id):
 371         """Report attempt to extract video information."""
 372         self.to_screen(u'%s: Extracting video information' % video_id)
 373
 374     def report_unavailable_format(self, video_id, format):
 375         """Report extracted video URL."""
 376         self.to_screen(u'%s: Format %s not available' % (video_id, format))
 377
 378     def report_rtmp_download(self):
 379         """Indicate the download will use the RTMP protocol."""
 380         self.to_screen(u'RTMP download detected')
 381
 382     def _get_available_subtitles(self, video_id):
 383         self.report_video_subtitles_download(video_id)
 384         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 385         try:
 386             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 387         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 388             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 389         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 390         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 391         if not sub_lang_list:
 392             return (u'video doesn\'t have subtitles', None)
 393         return sub_lang_list
 394
 395     def _list_available_subtitles(self, video_id):
 396         sub_lang_list = self._get_available_subtitles(video_id)
 397         self.report_video_subtitles_available(video_id, sub_lang_list)
 398
 399     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 400         """
 401         Return tuple:
 402         (error_message, sub_lang, sub)
 403         """
 404         self.report_video_subtitles_request(video_id, sub_lang, format)
 405         params = compat_urllib_parse.urlencode({
 406             'lang': sub_lang,
 407             'name': sub_name,
 408             'v': video_id,
 409             'fmt': format,
 410         })
 411         url = 'http://www.youtube.com/api/timedtext?' + params
 412         try:
 413             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 414         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 415             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 416         if not sub:
 417             return (u'Did not fetch video subtitles', None, None)
 418         return (None, sub_lang, sub)
 419
 420     def _request_automatic_caption(self, video_id, webpage):
 421         """We need the webpage for getting the captions url, pass it as an
 422            argument to speed up the process."""
 423         sub_lang = self._downloader.params.get('subtitleslang')
 424         sub_format = self._downloader.params.get('subtitlesformat')
 425         self.to_screen(u'%s: Looking for automatic captions' % video_id)
 426         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 427         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
 428         if mobj is None:
 429             return [(err_msg, None, None)]
 430         player_config = json.loads(mobj.group(1))
 431         try:
 432             args = player_config[u'args']
 433             caption_url = args[u'ttsurl']
 434             timestamp = args[u'timestamp']
 435             params = compat_urllib_parse.urlencode({
 436                 'lang': 'en',
 437                 'tlang': sub_lang,
 438                 'fmt': sub_format,
 439                 'ts': timestamp,
 440                 'kind': 'asr',
 441             })
 442             subtitles_url = caption_url + '&' + params
 443             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
 444             return [(None, sub_lang, sub)]
 445         except KeyError:
 446             return [(err_msg, None, None)]
 447
 448     def _extract_subtitle(self, video_id):
 449         """
 450         Return a list with a tuple:
 451         [(error_message, sub_lang, sub)]
 452         """
 453         sub_lang_list = self._get_available_subtitles(video_id)
 454         sub_format = self._downloader.params.get('subtitlesformat')
 455         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 456             return [(sub_lang_list[0], None, None)]
 457         if self._downloader.params.get('subtitleslang', False):
 458             sub_lang = self._downloader.params.get('subtitleslang')
 459         elif 'en' in sub_lang_list:
 460             sub_lang = 'en'
 461         else:
 462             sub_lang = list(sub_lang_list.keys())[0]
 463         if not sub_lang in sub_lang_list:
 464             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 465
 466         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 467         return [subtitle]
 468
 469     def _extract_all_subtitles(self, video_id):
 470         sub_lang_list = self._get_available_subtitles(video_id)
 471         sub_format = self._downloader.params.get('subtitlesformat')
 472         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 473             return [(sub_lang_list[0], None, None)]
 474         subtitles = []
 475         for sub_lang in sub_lang_list:
 476             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 477             subtitles.append(subtitle)
 478         return subtitles
 479
 480     def _print_formats(self, formats):
 481         print('Available formats:')
 482         for x in formats:
 483             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 484
 485     def _real_initialize(self):
 486         if self._downloader is None:
 487             return
 488
 489         username = None
 490         password = None
 491         downloader_params = self._downloader.params
 492
 493         # Attempt to use provided username and password or .netrc data
 494         if downloader_params.get('username', None) is not None:
 495             username = downloader_params['username']
 496             password = downloader_params['password']
 497         elif downloader_params.get('usenetrc', False):
 498             try:
 499                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 500                 if info is not None:
 501                     username = info[0]
 502                     password = info[2]
 503                 else:
 504                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 505             except (IOError, netrc.NetrcParseError) as err:
 506                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 507                 return
 508
 509         # Set language
 510         request = compat_urllib_request.Request(self._LANG_URL)
 511         try:
 512             self.report_lang()
 513             compat_urllib_request.urlopen(request).read()
 514         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 515             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 516             return
 517
 518         # No authentication to be performed
 519         if username is None:
 520             return
 521
 522         request = compat_urllib_request.Request(self._LOGIN_URL)
 523         try:
 524             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 525         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 526             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 527             return
 528
 529         galx = None
 530         dsh = None
 531         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 532         if match:
 533           galx = match.group(1)
 534
 535         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 536         if match:
 537           dsh = match.group(1)
 538
 539         # Log in
 540         login_form_strs = {
 541                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 542                 u'Email': username,
 543                 u'GALX': galx,
 544                 u'Passwd': password,
 545                 u'PersistentCookie': u'yes',
 546                 u'_utf8': u'霱',
 547                 u'bgresponse': u'js_disabled',
 548                 u'checkConnection': u'',
 549                 u'checkedDomains': u'youtube',
 550                 u'dnConn': u'',
 551                 u'dsh': dsh,
 552                 u'pstMsg': u'0',
 553                 u'rmShown': u'1',
 554                 u'secTok': u'',
 555                 u'signIn': u'Sign in',
 556                 u'timeStmp': u'',
 557                 u'service': u'youtube',
 558                 u'uilel': u'3',
 559                 u'hl': u'en_US',
 560         }
 561         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 562         # chokes on unicode
 563         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 564         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 565         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 566         try:
 567             self.report_login()
 568             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 569             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 570                 self._downloader.report_warning(u'unable to log in: bad username or password')
 571                 return
 572         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 573             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 574             return
 575
 576         # Confirm age
 577         age_form = {
 578                 'next_url':     '/',
 579                 'action_confirm':   'Confirm',
 580                 }
 581         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 582         try:
 583             self.report_age_confirmation()
 584             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 585         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 586             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 587
 588     def _extract_id(self, url):
 589         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 590         if mobj is None:
 591             raise ExtractorError(u'Invalid URL: %s' % url)
 592         video_id = mobj.group(2)
 593         return video_id
 594
 595     def _real_extract(self, url):
 596         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 597         mobj = re.search(self._NEXT_URL_RE, url)
 598         if mobj:
 599             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 600         video_id = self._extract_id(url)
 601
 602         # Get video webpage
 603         self.report_video_webpage_download(video_id)
 604         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 605         request = compat_urllib_request.Request(url)
 606         try:
 607             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 608         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 609             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
 610
 611         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 612
 613         # Attempt to extract SWF player URL
 614         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 615         if mobj is not None:
 616             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 617         else:
 618             player_url = None
 619
 620         # Get video info
 621         self.report_video_info_webpage_download(video_id)
 622         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 623             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 624                     % (video_id, el_type))
 625             video_info_webpage = self._download_webpage(video_info_url, video_id,
 626                                     note=False,
 627                                     errnote='unable to download video info webpage')
 628             video_info = compat_parse_qs(video_info_webpage)
 629             if 'token' in video_info:
 630                 break
 631         if 'token' not in video_info:
 632             if 'reason' in video_info:
 633                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
 634             else:
 635                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
 636
 637         # Check for "rental" videos
 638         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 639             raise ExtractorError(u'"rental" videos not supported')
 640
 641         # Start extracting information
 642         self.report_information_extraction(video_id)
 643
 644         # uploader
 645         if 'author' not in video_info:
 646             raise ExtractorError(u'Unable to extract uploader name')
 647         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 648
 649         # uploader_id
 650         video_uploader_id = None
 651         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 652         if mobj is not None:
 653             video_uploader_id = mobj.group(1)
 654         else:
 655             self._downloader.report_warning(u'unable to extract uploader nickname')
 656
 657         # title
 658         if 'title' not in video_info:
 659             raise ExtractorError(u'Unable to extract video title')
 660         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 661
 662         # thumbnail image
 663         if 'thumbnail_url' not in video_info:
 664             self._downloader.report_warning(u'unable to extract video thumbnail')
 665             video_thumbnail = ''
 666         else:   # don't panic if we can't find it
 667             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 668
 669         # upload date
 670         upload_date = None
 671         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 672         if mobj is not None:
 673             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 674             upload_date = unified_strdate(upload_date)
 675
 676         # description
 677         video_description = get_element_by_id("eow-description", video_webpage)
 678         if video_description:
 679             video_description = clean_html(video_description)
 680         else:
 681             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 682             if fd_mobj:
 683                 video_description = unescapeHTML(fd_mobj.group(1))
 684             else:
 685                 video_description = u''
 686
 687         # subtitles
 688         video_subtitles = None
 689
 690         if self._downloader.params.get('writesubtitles', False):
 691             video_subtitles = self._extract_subtitle(video_id)
 692             if video_subtitles:
 693                 (sub_error, sub_lang, sub) = video_subtitles[0]
 694                 if sub_error:
 695                     # We try with the automatic captions
 696                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
 697                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
 698                     if sub is not None:
 699                         pass
 700                     else:
 701                         # We report the original error
 702                         self._downloader.report_error(sub_error)
 703
 704         if self._downloader.params.get('allsubtitles', False):
 705             video_subtitles = self._extract_all_subtitles(video_id)
 706             for video_subtitle in video_subtitles:
 707                 (sub_error, sub_lang, sub) = video_subtitle
 708                 if sub_error:
 709                     self._downloader.report_error(sub_error)
 710
 711         if self._downloader.params.get('listsubtitles', False):
 712             sub_lang_list = self._list_available_subtitles(video_id)
 713             return
 714
 715         if 'length_seconds' not in video_info:
 716             self._downloader.report_warning(u'unable to extract video duration')
 717             video_duration = ''
 718         else:
 719             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 720
 721         # token
 722         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 723
 724         # Decide which formats to download
 725         req_format = self._downloader.params.get('format', None)
 726
 727         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 728             self.report_rtmp_download()
 729             video_url_list = [(None, video_info['conn'][0])]
 730         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 731             url_map = {}
 732             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
 733                 url_data = compat_parse_qs(url_data_str)
 734                 if 'itag' in url_data and 'url' in url_data:
 735                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
 736                     if not 'ratebypass' in url: url += '&ratebypass=yes'
 737                     url_map[url_data['itag'][0]] = url
 738
 739             format_limit = self._downloader.params.get('format_limit', None)
 740             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 741             if format_limit is not None and format_limit in available_formats:
 742                 format_list = available_formats[available_formats.index(format_limit):]
 743             else:
 744                 format_list = available_formats
 745             existing_formats = [x for x in format_list if x in url_map]
 746             if len(existing_formats) == 0:
 747                 raise ExtractorError(u'no known formats available for video')
 748             if self._downloader.params.get('listformats', None):
 749                 self._print_formats(existing_formats)
 750                 return
 751             if req_format is None or req_format == 'best':
 752                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 753             elif req_format == 'worst':
 754                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 755             elif req_format in ('-1', 'all'):
 756                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 757             else:
 758                 # Specific formats. We pick the first in a slash-delimeted sequence.
 759                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 760                 req_formats = req_format.split('/')
 761                 video_url_list = None
 762                 for rf in req_formats:
 763                     if rf in url_map:
 764                         video_url_list = [(rf, url_map[rf])]
 765                         break
 766                 if video_url_list is None:
 767                     raise ExtractorError(u'requested format not available')
 768         else:
 769             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
 770
 771         results = []
 772         for format_param, video_real_url in video_url_list:
 773             # Extension
 774             video_extension = self._video_extensions.get(format_param, 'flv')
 775
 776             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 777                                               self._video_dimensions.get(format_param, '???'))
 778
 779             results.append({
 780                 'id':       video_id,
 781                 'url':      video_real_url,
 782                 'uploader': video_uploader,
 783                 'uploader_id': video_uploader_id,
 784                 'upload_date':  upload_date,
 785                 'title':    video_title,
 786                 'ext':      video_extension,
 787                 'format':   video_format,
 788                 'thumbnail':    video_thumbnail,
 789                 'description':  video_description,
 790                 'player_url':   player_url,
 791                 'subtitles':    video_subtitles,
 792                 'duration':     video_duration
 793             })
 794         return results
 795
 796
 797 class MetacafeIE(InfoExtractor):
 798     """Information Extractor for metacafe.com."""
 799
 800     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 801     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 802     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 803     IE_NAME = u'metacafe'
 804
 805     def report_disclaimer(self):
 806         """Report disclaimer retrieval."""
 807         self.to_screen(u'Retrieving disclaimer')
 808
 809     def _real_initialize(self):
 810         # Retrieve disclaimer
 811         request = compat_urllib_request.Request(self._DISCLAIMER)
 812         try:
 813             self.report_disclaimer()
 814             disclaimer = compat_urllib_request.urlopen(request).read()
 815         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 816             raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
 817
 818         # Confirm age
 819         disclaimer_form = {
 820             'filters': '0',
 821             'submit': "Continue - I'm over 18",
 822             }
 823         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 824         try:
 825             self.report_age_confirmation()
 826             disclaimer = compat_urllib_request.urlopen(request).read()
 827         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 828             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 829
 830     def _real_extract(self, url):
 831         # Extract id and simplified title from URL
 832         mobj = re.match(self._VALID_URL, url)
 833         if mobj is None:
 834             raise ExtractorError(u'Invalid URL: %s' % url)
 835
 836         video_id = mobj.group(1)
 837
 838         # Check if video comes from YouTube
 839         mobj2 = re.match(r'^yt-(.*)$', video_id)
 840         if mobj2 is not None:
 841             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
 842
 843         # Retrieve video webpage to extract further information
 844         webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
 845
 846         # Extract URL, uploader and title from webpage
 847         self.report_extraction(video_id)
 848         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 849         if mobj is not None:
 850             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 851             video_extension = mediaURL[-3:]
 852
 853             # Extract gdaKey if available
 854             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 855             if mobj is None:
 856                 video_url = mediaURL
 857             else:
 858                 gdaKey = mobj.group(1)
 859                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 860         else:
 861             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 862             if mobj is None:
 863                 raise ExtractorError(u'Unable to extract media URL')
 864             vardict = compat_parse_qs(mobj.group(1))
 865             if 'mediaData' not in vardict:
 866                 raise ExtractorError(u'Unable to extract media URL')
 867             mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
 868             if mobj is None:
 869                 raise ExtractorError(u'Unable to extract media URL')
 870             mediaURL = mobj.group('mediaURL').replace('\\/', '/')
 871             video_extension = mediaURL[-3:]
 872             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
 873
 874         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 875         if mobj is None:
 876             raise ExtractorError(u'Unable to extract title')
 877         video_title = mobj.group(1).decode('utf-8')
 878
 879         mobj = re.search(r'submitter=(.*?);', webpage)
 880         if mobj is None:
 881             raise ExtractorError(u'Unable to extract uploader nickname')
 882         video_uploader = mobj.group(1)
 883
 884         return [{
 885             'id':       video_id.decode('utf-8'),
 886             'url':      video_url.decode('utf-8'),
 887             'uploader': video_uploader.decode('utf-8'),
 888             'upload_date':  None,
 889             'title':    video_title,
 890             'ext':      video_extension.decode('utf-8'),
 891         }]
 892
 893 class DailymotionIE(InfoExtractor):
 894     """Information Extractor for Dailymotion"""
 895
 896     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 897     IE_NAME = u'dailymotion'
 898
 899     def _real_extract(self, url):
 900         # Extract id and simplified title from URL
 901         mobj = re.match(self._VALID_URL, url)
 902         if mobj is None:
 903             raise ExtractorError(u'Invalid URL: %s' % url)
 904
 905         video_id = mobj.group(1).split('_')[0].split('?')[0]
 906
 907         video_extension = 'mp4'
 908
 909         # Retrieve video webpage to extract further information
 910         request = compat_urllib_request.Request(url)
 911         request.add_header('Cookie', 'family_filter=off')
 912         webpage = self._download_webpage(request, video_id)
 913
 914         # Extract URL, uploader and title from webpage
 915         self.report_extraction(video_id)
 916         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 917         if mobj is None:
 918             raise ExtractorError(u'Unable to extract media URL')
 919         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 920
 921         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 922             if key in flashvars:
 923                 max_quality = key
 924                 self.to_screen(u'Using %s' % key)
 925                 break
 926         else:
 927             raise ExtractorError(u'Unable to extract video URL')
 928
 929         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 930         if mobj is None:
 931             raise ExtractorError(u'Unable to extract video URL')
 932
 933         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 934
 935         # TODO: support choosing qualities
 936
 937         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 938         if mobj is None:
 939             raise ExtractorError(u'Unable to extract title')
 940         video_title = unescapeHTML(mobj.group('title'))
 941
 942         video_uploader = None
 943         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 944         if mobj is None:
 945             # lookin for official user
 946             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 947             if mobj_official is None:
 948                 self._downloader.report_warning(u'unable to extract uploader nickname')
 949             else:
 950                 video_uploader = mobj_official.group(1)
 951         else:
 952             video_uploader = mobj.group(1)
 953
 954         video_upload_date = None
 955         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 956         if mobj is not None:
 957             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 958
 959         return [{
 960             'id':       video_id,
 961             'url':      video_url,
 962             'uploader': video_uploader,
 963             'upload_date':  video_upload_date,
 964             'title':    video_title,
 965             'ext':      video_extension,
 966         }]
 967
 968
 969 class PhotobucketIE(InfoExtractor):
 970     """Information extractor for photobucket.com."""
 971
 972     # TODO: the original _VALID_URL was:
 973     # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 974     # Check if it's necessary to keep the old extracion process
 975     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
 976     IE_NAME = u'photobucket'
 977
 978     def _real_extract(self, url):
 979         # Extract id from URL
 980         mobj = re.match(self._VALID_URL, url)
 981         if mobj is None:
 982             raise ExtractorError(u'Invalid URL: %s' % url)
 983
 984         video_id = mobj.group('id')
 985
 986         video_extension = mobj.group('ext')
 987
 988         # Retrieve video webpage to extract further information
 989         webpage = self._download_webpage(url, video_id)
 990
 991         # Extract URL, uploader, and title from webpage
 992         self.report_extraction(video_id)
 993         # We try first by looking the javascript code:
 994         mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
 995         if mobj is not None:
 996             info = json.loads(mobj.group('json'))
 997             return [{
 998                 'id':       video_id,
 999                 'url':      info[u'downloadUrl'],
1000                 'uploader': info[u'username'],
1001                 'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
1002                 'title':    info[u'title'],
1003                 'ext':      video_extension,
1004                 'thumbnail': info[u'thumbUrl'],
1005             }]
1006
1007         # We try looking in other parts of the webpage
1008         video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1009             webpage, u'video URL')
1010
1011         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1012         if mobj is None:
1013             raise ExtractorError(u'Unable to extract title')
1014         video_title = mobj.group(1).decode('utf-8')
1015         video_uploader = mobj.group(2).decode('utf-8')
1016
1017         return [{
1018             'id':       video_id.decode('utf-8'),
1019             'url':      video_url.decode('utf-8'),
1020             'uploader': video_uploader,
1021             'upload_date':  None,
1022             'title':    video_title,
1023             'ext':      video_extension.decode('utf-8'),
1024         }]
1025
1026
1027 class YahooIE(InfoExtractor):
1028     """Information extractor for screen.yahoo.com."""
1029     _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1030
1031     def _real_extract(self, url):
1032         mobj = re.match(self._VALID_URL, url)
1033         if mobj is None:
1034             raise ExtractorError(u'Invalid URL: %s' % url)
1035         video_id = mobj.group('id')
1036         webpage = self._download_webpage(url, video_id)
1037         m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1038
1039         if m_id is None:
1040             # TODO: Check which url parameters are required
1041             info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1042             webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
1043             info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1044                         <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1045                         <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1046                         <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1047                         '''
1048             self.report_extraction(video_id)
1049             m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1050             if m_info is None:
1051                 raise ExtractorError(u'Unable to extract video info')
1052             video_title = m_info.group('title')
1053             video_description = m_info.group('description')
1054             video_thumb = m_info.group('thumb')
1055             video_date = m_info.group('date')
1056             video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1057
1058             # TODO: Find a way to get mp4 videos
1059             rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1060             webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1061             m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1062             video_url = m_rest.group('url')
1063             video_path = m_rest.group('path')
1064             if m_rest is None:
1065                 raise ExtractorError(u'Unable to extract video url')
1066
1067         else: # We have to use a different method if another id is defined
1068             long_id = m_id.group('new_id')
1069             info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1070             webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
1071             json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1072             info = json.loads(json_str)
1073             res = info[u'query'][u'results'][u'mediaObj'][0]
1074             stream = res[u'streams'][0]
1075             video_path = stream[u'path']
1076             video_url = stream[u'host']
1077             meta = res[u'meta']
1078             video_title = meta[u'title']
1079             video_description = meta[u'description']
1080             video_thumb = meta[u'thumbnail']
1081             video_date = None # I can't find it
1082
1083         info_dict = {
1084                      'id': video_id,
1085                      'url': video_url,
1086                      'play_path': video_path,
1087                      'title':video_title,
1088                      'description': video_description,
1089                      'thumbnail': video_thumb,
1090                      'upload_date': video_date,
1091                      'ext': 'flv',
1092                      }
1093         return info_dict
1094
1095 class VimeoIE(InfoExtractor):
1096     """Information extractor for vimeo.com."""
1097
1098     # _VALID_URL matches Vimeo URLs
1099     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1100     IE_NAME = u'vimeo'
1101
1102     def _real_extract(self, url, new_video=True):
1103         # Extract ID from URL
1104         mobj = re.match(self._VALID_URL, url)
1105         if mobj is None:
1106             raise ExtractorError(u'Invalid URL: %s' % url)
1107
1108         video_id = mobj.group('id')
1109         if not mobj.group('proto'):
1110             url = 'https://' + url
1111         if mobj.group('direct_link') or mobj.group('pro'):
1112             url = 'https://vimeo.com/' + video_id
1113
1114         # Retrieve video webpage to extract further information
1115         request = compat_urllib_request.Request(url, None, std_headers)
1116         webpage = self._download_webpage(request, video_id)
1117
1118         # Now we begin extracting as much information as we can from what we
1119         # retrieved. First we extract the information common to all extractors,
1120         # and latter we extract those that are Vimeo specific.
1121         self.report_extraction(video_id)
1122
1123         # Extract the config JSON
1124         try:
1125             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1126             config = json.loads(config)
1127         except:
1128             if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1129                 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1130             else:
1131                 raise ExtractorError(u'Unable to extract info section')
1132
1133         # Extract title
1134         video_title = config["video"]["title"]
1135
1136         # Extract uploader and uploader_id
1137         video_uploader = config["video"]["owner"]["name"]
1138         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1139
1140         # Extract video thumbnail
1141         video_thumbnail = config["video"]["thumbnail"]
1142
1143         # Extract video description
1144         video_description = get_element_by_attribute("itemprop", "description", webpage)
1145         if video_description: video_description = clean_html(video_description)
1146         else: video_description = u''
1147
1148         # Extract upload date
1149         video_upload_date = None
1150         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1151         if mobj is not None:
1152             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1153
1154         # Vimeo specific: extract request signature and timestamp
1155         sig = config['request']['signature']
1156         timestamp = config['request']['timestamp']
1157
1158         # Vimeo specific: extract video codec and quality information
1159         # First consider quality, then codecs, then take everything
1160         # TODO bind to format param
1161         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1162         files = { 'hd': [], 'sd': [], 'other': []}
1163         for codec_name, codec_extension in codecs:
1164             if codec_name in config["video"]["files"]:
1165                 if 'hd' in config["video"]["files"][codec_name]:
1166                     files['hd'].append((codec_name, codec_extension, 'hd'))
1167                 elif 'sd' in config["video"]["files"][codec_name]:
1168                     files['sd'].append((codec_name, codec_extension, 'sd'))
1169                 else:
1170                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1171
1172         for quality in ('hd', 'sd', 'other'):
1173             if len(files[quality]) > 0:
1174                 video_quality = files[quality][0][2]
1175                 video_codec = files[quality][0][0]
1176                 video_extension = files[quality][0][1]
1177                 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1178                 break
1179         else:
1180             raise ExtractorError(u'No known codec found')
1181
1182         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1183                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1184
1185         return [{
1186             'id':       video_id,
1187             'url':      video_url,
1188             'uploader': video_uploader,
1189             'uploader_id': video_uploader_id,
1190             'upload_date':  video_upload_date,
1191             'title':    video_title,
1192             'ext':      video_extension,
1193             'thumbnail':    video_thumbnail,
1194             'description':  video_description,
1195         }]
1196
1197
1198 class ArteTvIE(InfoExtractor):
1199     """arte.tv information extractor."""
1200
1201     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1202     _LIVE_URL = r'index-[0-9]+\.html$'
1203
1204     IE_NAME = u'arte.tv'
1205
1206     def fetch_webpage(self, url):
1207         request = compat_urllib_request.Request(url)
1208         try:
1209             self.report_download_webpage(url)
1210             webpage = compat_urllib_request.urlopen(request).read()
1211         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1212             raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1213         except ValueError as err:
1214             raise ExtractorError(u'Invalid URL: %s' % url)
1215         return webpage
1216
1217     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1218         page = self.fetch_webpage(url)
1219         mobj = re.search(regex, page, regexFlags)
1220         info = {}
1221
1222         if mobj is None:
1223             raise ExtractorError(u'Invalid URL: %s' % url)
1224
1225         for (i, key, err) in matchTuples:
1226             if mobj.group(i) is None:
1227                 raise ExtractorError(err)
1228             else:
1229                 info[key] = mobj.group(i)
1230
1231         return info
1232
1233     def extractLiveStream(self, url):
1234         video_lang = url.split('/')[-4]
1235         info = self.grep_webpage(
1236             url,
1237             r'src="(.*?/videothek_js.*?\.js)',
1238             0,
1239             [
1240                 (1, 'url', u'Invalid URL: %s' % url)
1241             ]
1242         )
1243         http_host = url.split('/')[2]
1244         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1245         info = self.grep_webpage(
1246             next_url,
1247             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1248                 '(http://.*?\.swf).*?' +
1249                 '(rtmp://.*?)\'',
1250             re.DOTALL,
1251             [
1252                 (1, 'path',   u'could not extract video path: %s' % url),
1253                 (2, 'player', u'could not extract video player: %s' % url),
1254                 (3, 'url',    u'could not extract video url: %s' % url)
1255             ]
1256         )
1257         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1258
1259     def extractPlus7Stream(self, url):
1260         video_lang = url.split('/')[-3]
1261         info = self.grep_webpage(
1262             url,
1263             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1264             0,
1265             [
1266                 (1, 'url', u'Invalid URL: %s' % url)
1267             ]
1268         )
1269         next_url = compat_urllib_parse.unquote(info.get('url'))
1270         info = self.grep_webpage(
1271             next_url,
1272             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1273             0,
1274             [
1275                 (1, 'url', u'Could not find <video> tag: %s' % url)
1276             ]
1277         )
1278         next_url = compat_urllib_parse.unquote(info.get('url'))
1279
1280         info = self.grep_webpage(
1281             next_url,
1282             r'<video id="(.*?)".*?>.*?' +
1283                 '<name>(.*?)</name>.*?' +
1284                 '<dateVideo>(.*?)</dateVideo>.*?' +
1285                 '<url quality="hd">(.*?)</url>',
1286             re.DOTALL,
1287             [
1288                 (1, 'id',    u'could not extract video id: %s' % url),
1289                 (2, 'title', u'could not extract video title: %s' % url),
1290                 (3, 'date',  u'could not extract video date: %s' % url),
1291                 (4, 'url',   u'could not extract video url: %s' % url)
1292             ]
1293         )
1294
1295         return {
1296             'id':           info.get('id'),
1297             'url':          compat_urllib_parse.unquote(info.get('url')),
1298             'uploader':     u'arte.tv',
1299             'upload_date':  unified_strdate(info.get('date')),
1300             'title':        info.get('title').decode('utf-8'),
1301             'ext':          u'mp4',
1302             'format':       u'NA',
1303             'player_url':   None,
1304         }
1305
1306     def _real_extract(self, url):
1307         video_id = url.split('/')[-1]
1308         self.report_extraction(video_id)
1309
1310         if re.search(self._LIVE_URL, video_id) is not None:
1311             self.extractLiveStream(url)
1312             return
1313         else:
1314             info = self.extractPlus7Stream(url)
1315
1316         return [info]
1317
1318
1319 class GenericIE(InfoExtractor):
1320     """Generic last-resort information extractor."""
1321
1322     _VALID_URL = r'.*'
1323     IE_NAME = u'generic'
1324
1325     def report_download_webpage(self, video_id):
1326         """Report webpage download."""
1327         if not self._downloader.params.get('test', False):
1328             self._downloader.report_warning(u'Falling back on generic information extractor.')
1329         super(GenericIE, self).report_download_webpage(video_id)
1330
1331     def report_following_redirect(self, new_url):
1332         """Report information extraction."""
1333         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1334
1335     def _test_redirect(self, url):
1336         """Check if it is a redirect, like url shorteners, in case return the new url."""
1337         class HeadRequest(compat_urllib_request.Request):
1338             def get_method(self):
1339                 return "HEAD"
1340
1341         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1342             """
1343             Subclass the HTTPRedirectHandler to make it use our
1344             HeadRequest also on the redirected URL
1345             """
1346             def redirect_request(self, req, fp, code, msg, headers, newurl):
1347                 if code in (301, 302, 303, 307):
1348                     newurl = newurl.replace(' ', '%20')
1349                     newheaders = dict((k,v) for k,v in req.headers.items()
1350                                       if k.lower() not in ("content-length", "content-type"))
1351                     return HeadRequest(newurl,
1352                                        headers=newheaders,
1353                                        origin_req_host=req.get_origin_req_host(),
1354                                        unverifiable=True)
1355                 else:
1356                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1357
1358         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1359             """
1360             Fallback to GET if HEAD is not allowed (405 HTTP error)
1361             """
1362             def http_error_405(self, req, fp, code, msg, headers):
1363                 fp.read()
1364                 fp.close()
1365
1366                 newheaders = dict((k,v) for k,v in req.headers.items()
1367                                   if k.lower() not in ("content-length", "content-type"))
1368                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1369                                                  headers=newheaders,
1370                                                  origin_req_host=req.get_origin_req_host(),
1371                                                  unverifiable=True))
1372
1373         # Build our opener
1374         opener = compat_urllib_request.OpenerDirector()
1375         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1376                         HTTPMethodFallback, HEADRedirectHandler,
1377                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1378             opener.add_handler(handler())
1379
1380         response = opener.open(HeadRequest(url))
1381         if response is None:
1382             raise ExtractorError(u'Invalid URL protocol')
1383         new_url = response.geturl()
1384
1385         if url == new_url:
1386             return False
1387
1388         self.report_following_redirect(new_url)
1389         return new_url
1390
1391     def _real_extract(self, url):
1392         new_url = self._test_redirect(url)
1393         if new_url: return [self.url_result(new_url)]
1394
1395         video_id = url.split('/')[-1]
1396         try:
1397             webpage = self._download_webpage(url, video_id)
1398         except ValueError as err:
1399             # since this is the last-resort InfoExtractor, if
1400             # this error is thrown, it'll be thrown here
1401             raise ExtractorError(u'Invalid URL: %s' % url)
1402
1403         self.report_extraction(video_id)
1404         # Start with something easy: JW Player in SWFObject
1405         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1406         if mobj is None:
1407             # Broaden the search a little bit
1408             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1409         if mobj is None:
1410             # Broaden the search a little bit: JWPlayer JS loader
1411             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1412         if mobj is None:
1413             raise ExtractorError(u'Invalid URL: %s' % url)
1414
1415         # It's possible that one of the regexes
1416         # matched, but returned an empty group:
1417         if mobj.group(1) is None:
1418             raise ExtractorError(u'Invalid URL: %s' % url)
1419
1420         video_url = compat_urllib_parse.unquote(mobj.group(1))
1421         video_id = os.path.basename(video_url)
1422
1423         # here's a fun little line of code for you:
1424         video_extension = os.path.splitext(video_id)[1][1:]
1425         video_id = os.path.splitext(video_id)[0]
1426
1427         # it's tempting to parse this further, but you would
1428         # have to take into account all the variations like
1429         #   Video Title - Site Name
1430         #   Site Name | Video Title
1431         #   Video Title - Tagline | Site Name
1432         # and so on and so forth; it's just not practical
1433         mobj = re.search(r'<title>(.*)</title>', webpage)
1434         if mobj is None:
1435             raise ExtractorError(u'Unable to extract title')
1436         video_title = mobj.group(1)
1437
1438         # video uploader is domain name
1439         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1440         if mobj is None:
1441             raise ExtractorError(u'Unable to extract title')
1442         video_uploader = mobj.group(1)
1443
1444         return [{
1445             'id':       video_id,
1446             'url':      video_url,
1447             'uploader': video_uploader,
1448             'upload_date':  None,
1449             'title':    video_title,
1450             'ext':      video_extension,
1451         }]
1452
1453
1454 class YoutubeSearchIE(SearchInfoExtractor):
1455     """Information Extractor for YouTube search queries."""
1456     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1457     _MAX_RESULTS = 1000
1458     IE_NAME = u'youtube:search'
1459     _SEARCH_KEY = 'ytsearch'
1460
1461     def report_download_page(self, query, pagenum):
1462         """Report attempt to download search page with given number."""
1463         query = query.decode(preferredencoding())
1464         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1465
1466     def _get_n_results(self, query, n):
1467         """Get a specified number of results for a query"""
1468
1469         video_ids = []
1470         pagenum = 0
1471         limit = n
1472
1473         while (50 * pagenum) < limit:
1474             self.report_download_page(query, pagenum+1)
1475             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1476             request = compat_urllib_request.Request(result_url)
1477             try:
1478                 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1479             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1480                 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1481             api_response = json.loads(data)['data']
1482
1483             if not 'items' in api_response:
1484                 raise ExtractorError(u'[youtube] No video results')
1485
1486             new_ids = list(video['id'] for video in api_response['items'])
1487             video_ids += new_ids
1488
1489             limit = min(n, api_response['totalItems'])
1490             pagenum += 1
1491
1492         if len(video_ids) > n:
1493             video_ids = video_ids[:n]
1494         videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1495         return self.playlist_result(videos, query)
1496
1497
1498 class GoogleSearchIE(SearchInfoExtractor):
1499     """Information Extractor for Google Video search queries."""
1500     _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1501     _MAX_RESULTS = 1000
1502     IE_NAME = u'video.google:search'
1503     _SEARCH_KEY = 'gvsearch'
1504
1505     def _get_n_results(self, query, n):
1506         """Get a specified number of results for a query"""
1507
1508         res = {
1509             '_type': 'playlist',
1510             'id': query,
1511             'entries': []
1512         }
1513
1514         for pagenum in itertools.count(1):
1515             result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1516             webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1517                                              note='Downloading result page ' + str(pagenum))
1518
1519             for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1520                 e = {
1521                     '_type': 'url',
1522                     'url': mobj.group(1)
1523                 }
1524                 res['entries'].append(e)
1525
1526             if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1527                 return res
1528
1529 class YahooSearchIE(SearchInfoExtractor):
1530     """Information Extractor for Yahoo! Video search queries."""
1531
1532     _MAX_RESULTS = 1000
1533     IE_NAME = u'screen.yahoo:search'
1534     _SEARCH_KEY = 'yvsearch'
1535
1536     def _get_n_results(self, query, n):
1537         """Get a specified number of results for a query"""
1538
1539         res = {
1540             '_type': 'playlist',
1541             'id': query,
1542             'entries': []
1543         }
1544         for pagenum in itertools.count(0):
1545             result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1546             webpage = self._download_webpage(result_url, query,
1547                                              note='Downloading results page '+str(pagenum+1))
1548             info = json.loads(webpage)
1549             m = info[u'm']
1550             results = info[u'results']
1551
1552             for (i, r) in enumerate(results):
1553                 if (pagenum * 30) +i >= n:
1554                     break
1555                 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1556                 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1557                 res['entries'].append(e)
1558             if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1559                 break
1560
1561         return res
1562
1563
1564 class YoutubePlaylistIE(InfoExtractor):
1565     """Information Extractor for YouTube playlists."""
1566
1567     _VALID_URL = r"""(?:
1568                         (?:https?://)?
1569                         (?:\w+\.)?
1570                         youtube\.com/
1571                         (?:
1572                            (?:course|view_play_list|my_playlists|artist|playlist|watch)
1573                            \? (?:.*?&)*? (?:p|a|list)=
1574                         |  p/
1575                         )
1576                         ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1577                         .*
1578                      |
1579                         ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1580                      )"""
1581     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1582     _MAX_RESULTS = 50
1583     IE_NAME = u'youtube:playlist'
1584
1585     @classmethod
1586     def suitable(cls, url):
1587         """Receives a URL and returns True if suitable for this IE."""
1588         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1589
1590     def _real_extract(self, url):
1591         # Extract playlist id
1592         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1593         if mobj is None:
1594             raise ExtractorError(u'Invalid URL: %s' % url)
1595
1596         # Download playlist videos from API
1597         playlist_id = mobj.group(1) or mobj.group(2)
1598         page_num = 1
1599         videos = []
1600
1601         while True:
1602             url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1603             page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1604
1605             try:
1606                 response = json.loads(page)
1607             except ValueError as err:
1608                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1609
1610             if 'feed' not in response:
1611                 raise ExtractorError(u'Got a malformed response from YouTube API')
1612             playlist_title = response['feed']['title']['$t']
1613             if 'entry' not in response['feed']:
1614                 # Number of videos is a multiple of self._MAX_RESULTS
1615                 break
1616
1617             videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1618                         for entry in response['feed']['entry']
1619                         if 'content' in entry ]
1620
1621             if len(response['feed']['entry']) < self._MAX_RESULTS:
1622                 break
1623             page_num += 1
1624
1625         videos = [v[1] for v in sorted(videos)]
1626
1627         url_results = [self.url_result(url, 'Youtube') for url in videos]
1628         return [self.playlist_result(url_results, playlist_id, playlist_title)]
1629
1630
1631 class YoutubeChannelIE(InfoExtractor):
1632     """Information Extractor for YouTube channels."""
1633
1634     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1635     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1636     _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1637     _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1638     IE_NAME = u'youtube:channel'
1639
1640     def extract_videos_from_page(self, page):
1641         ids_in_page = []
1642         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1643             if mobj.group(1) not in ids_in_page:
1644                 ids_in_page.append(mobj.group(1))
1645         return ids_in_page
1646
1647     def _real_extract(self, url):
1648         # Extract channel id
1649         mobj = re.match(self._VALID_URL, url)
1650         if mobj is None:
1651             raise ExtractorError(u'Invalid URL: %s' % url)
1652
1653         # Download channel page
1654         channel_id = mobj.group(1)
1655         video_ids = []
1656         pagenum = 1
1657
1658         url = self._TEMPLATE_URL % (channel_id, pagenum)
1659         page = self._download_webpage(url, channel_id,
1660                                       u'Downloading page #%s' % pagenum)
1661
1662         # Extract video identifiers
1663         ids_in_page = self.extract_videos_from_page(page)
1664         video_ids.extend(ids_in_page)
1665
1666         # Download any subsequent channel pages using the json-based channel_ajax query
1667         if self._MORE_PAGES_INDICATOR in page:
1668             while True:
1669                 pagenum = pagenum + 1
1670
1671                 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1672                 page = self._download_webpage(url, channel_id,
1673                                               u'Downloading page #%s' % pagenum)
1674
1675                 page = json.loads(page)
1676
1677                 ids_in_page = self.extract_videos_from_page(page['content_html'])
1678                 video_ids.extend(ids_in_page)
1679
1680                 if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']:
1681                     break
1682
1683         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1684
1685         urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1686         url_entries = [self.url_result(url, 'Youtube') for url in urls]
1687         return [self.playlist_result(url_entries, channel_id)]
1688
1689
1690 class YoutubeUserIE(InfoExtractor):
1691     """Information Extractor for YouTube users."""
1692
1693     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1694     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1695     _GDATA_PAGE_SIZE = 50
1696     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1697     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1698     IE_NAME = u'youtube:user'
1699
1700     def _real_extract(self, url):
1701         # Extract username
1702         mobj = re.match(self._VALID_URL, url)
1703         if mobj is None:
1704             raise ExtractorError(u'Invalid URL: %s' % url)
1705
1706         username = mobj.group(1)
1707
1708         # Download video ids using YouTube Data API. Result size per
1709         # query is limited (currently to 50 videos) so we need to query
1710         # page by page until there are no video ids - it means we got
1711         # all of them.
1712
1713         video_ids = []
1714         pagenum = 0
1715
1716         while True:
1717             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1718
1719             gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1720             page = self._download_webpage(gdata_url, username,
1721                                           u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1722
1723             # Extract video identifiers
1724             ids_in_page = []
1725
1726             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1727                 if mobj.group(1) not in ids_in_page:
1728                     ids_in_page.append(mobj.group(1))
1729
1730             video_ids.extend(ids_in_page)
1731
1732             # A little optimization - if current page is not
1733             # "full", ie. does not contain PAGE_SIZE video ids then
1734             # we can assume that this page is the last one - there
1735             # are no more ids on further pages - no need to query
1736             # again.
1737
1738             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1739                 break
1740
1741             pagenum += 1
1742
1743         urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1744         url_results = [self.url_result(url, 'Youtube') for url in urls]
1745         return [self.playlist_result(url_results, playlist_title = username)]
1746
1747
1748 class BlipTVUserIE(InfoExtractor):
1749     """Information Extractor for blip.tv users."""
1750
1751     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1752     _PAGE_SIZE = 12
1753     IE_NAME = u'blip.tv:user'
1754
1755     def _real_extract(self, url):
1756         # Extract username
1757         mobj = re.match(self._VALID_URL, url)
1758         if mobj is None:
1759             raise ExtractorError(u'Invalid URL: %s' % url)
1760
1761         username = mobj.group(1)
1762
1763         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1764
1765         page = self._download_webpage(url, username, u'Downloading user page')
1766         mobj = re.search(r'data-users-id="([^"]+)"', page)
1767         page_base = page_base % mobj.group(1)
1768
1769
1770         # Download video ids using BlipTV Ajax calls. Result size per
1771         # query is limited (currently to 12 videos) so we need to query
1772         # page by page until there are no video ids - it means we got
1773         # all of them.
1774
1775         video_ids = []
1776         pagenum = 1
1777
1778         while True:
1779             url = page_base + "&page=" + str(pagenum)
1780             page = self._download_webpage(url, username,
1781                                           u'Downloading video ids from page %d' % pagenum)
1782
1783             # Extract video identifiers
1784             ids_in_page = []
1785
1786             for mobj in re.finditer(r'href="/([^"]+)"', page):
1787                 if mobj.group(1) not in ids_in_page:
1788                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1789
1790             video_ids.extend(ids_in_page)
1791
1792             # A little optimization - if current page is not
1793             # "full", ie. does not contain PAGE_SIZE video ids then
1794             # we can assume that this page is the last one - there
1795             # are no more ids on further pages - no need to query
1796             # again.
1797
1798             if len(ids_in_page) < self._PAGE_SIZE:
1799                 break
1800
1801             pagenum += 1
1802
1803         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1804         url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1805         return [self.playlist_result(url_entries, playlist_title = username)]
1806
1807
1808 class DepositFilesIE(InfoExtractor):
1809     """Information extractor for depositfiles.com"""
1810
1811     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1812
1813     def _real_extract(self, url):
1814         file_id = url.split('/')[-1]
1815         # Rebuild url in english locale
1816         url = 'http://depositfiles.com/en/files/' + file_id
1817
1818         # Retrieve file webpage with 'Free download' button pressed
1819         free_download_indication = { 'gateway_result' : '1' }
1820         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1821         try:
1822             self.report_download_webpage(file_id)
1823             webpage = compat_urllib_request.urlopen(request).read()
1824         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1825             raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1826
1827         # Search for the real file URL
1828         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1829         if (mobj is None) or (mobj.group(1) is None):
1830             # Try to figure out reason of the error.
1831             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1832             if (mobj is not None) and (mobj.group(1) is not None):
1833                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1834                 raise ExtractorError(u'%s' % restriction_message)
1835             else:
1836                 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1837
1838         file_url = mobj.group(1)
1839         file_extension = os.path.splitext(file_url)[1][1:]
1840
1841         # Search for file title
1842         file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
1843
1844         return [{
1845             'id':       file_id.decode('utf-8'),
1846             'url':      file_url.decode('utf-8'),
1847             'uploader': None,
1848             'upload_date':  None,
1849             'title':    file_title,
1850             'ext':      file_extension.decode('utf-8'),
1851         }]
1852
1853
1854 class FacebookIE(InfoExtractor):
1855     """Information Extractor for Facebook"""
1856
1857     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1858     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1859     _NETRC_MACHINE = 'facebook'
1860     IE_NAME = u'facebook'
1861
1862     def report_login(self):
1863         """Report attempt to log in."""
1864         self.to_screen(u'Logging in')
1865
1866     def _real_initialize(self):
1867         if self._downloader is None:
1868             return
1869
1870         useremail = None
1871         password = None
1872         downloader_params = self._downloader.params
1873
1874         # Attempt to use provided username and password or .netrc data
1875         if downloader_params.get('username', None) is not None:
1876             useremail = downloader_params['username']
1877             password = downloader_params['password']
1878         elif downloader_params.get('usenetrc', False):
1879             try:
1880                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1881                 if info is not None:
1882                     useremail = info[0]
1883                     password = info[2]
1884                 else:
1885                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1886             except (IOError, netrc.NetrcParseError) as err:
1887                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1888                 return
1889
1890         if useremail is None:
1891             return
1892
1893         # Log in
1894         login_form = {
1895             'email': useremail,
1896             'pass': password,
1897             'login': 'Log+In'
1898             }
1899         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1900         try:
1901             self.report_login()
1902             login_results = compat_urllib_request.urlopen(request).read()
1903             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1904                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1905                 return
1906         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1907             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1908             return
1909
1910     def _real_extract(self, url):
1911         mobj = re.match(self._VALID_URL, url)
1912         if mobj is None:
1913             raise ExtractorError(u'Invalid URL: %s' % url)
1914         video_id = mobj.group('ID')
1915
1916         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1917         webpage = self._download_webpage(url, video_id)
1918
1919         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1920         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1921         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1922         if not m:
1923             raise ExtractorError(u'Cannot parse data')
1924         data = dict(json.loads(m.group(1)))
1925         params_raw = compat_urllib_parse.unquote(data['params'])
1926         params = json.loads(params_raw)
1927         video_data = params['video_data'][0]
1928         video_url = video_data.get('hd_src')
1929         if not video_url:
1930             video_url = video_data['sd_src']
1931         if not video_url:
1932             raise ExtractorError(u'Cannot find video URL')
1933         video_duration = int(video_data['video_duration'])
1934         thumbnail = video_data['thumbnail_src']
1935
1936         video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1937             webpage, u'title')
1938
1939         info = {
1940             'id': video_id,
1941             'title': video_title,
1942             'url': video_url,
1943             'ext': 'mp4',
1944             'duration': video_duration,
1945             'thumbnail': thumbnail,
1946         }
1947         return [info]
1948
1949
1950 class BlipTVIE(InfoExtractor):
1951     """Information extractor for blip.tv"""
1952
1953     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
1954     _URL_EXT = r'^.*\.([a-z0-9]+)$'
1955     IE_NAME = u'blip.tv'
1956
1957     def report_direct_download(self, title):
1958         """Report information extraction."""
1959         self.to_screen(u'%s: Direct download detected' % title)
1960
1961     def _real_extract(self, url):
1962         mobj = re.match(self._VALID_URL, url)
1963         if mobj is None:
1964             raise ExtractorError(u'Invalid URL: %s' % url)
1965
1966         # See https://github.com/rg3/youtube-dl/issues/857
1967         api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1968         if api_mobj is not None:
1969             url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1970         urlp = compat_urllib_parse_urlparse(url)
1971         if urlp.path.startswith('/play/'):
1972             request = compat_urllib_request.Request(url)
1973             response = compat_urllib_request.urlopen(request)
1974             redirecturl = response.geturl()
1975             rurlp = compat_urllib_parse_urlparse(redirecturl)
1976             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1977             url = 'http://blip.tv/a/a-' + file_id
1978             return self._real_extract(url)
1979
1980
1981         if '?' in url:
1982             cchar = '&'
1983         else:
1984             cchar = '?'
1985         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1986         request = compat_urllib_request.Request(json_url)
1987         request.add_header('User-Agent', 'iTunes/10.6.1')
1988         self.report_extraction(mobj.group(1))
1989         info = None
1990         try:
1991             urlh = compat_urllib_request.urlopen(request)
1992             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1993                 basename = url.split('/')[-1]
1994                 title,ext = os.path.splitext(basename)
1995                 title = title.decode('UTF-8')
1996                 ext = ext.replace('.', '')
1997                 self.report_direct_download(title)
1998                 info = {
1999                     'id': title,
2000                     'url': url,
2001                     'uploader': None,
2002                     'upload_date': None,
2003                     'title': title,
2004                     'ext': ext,
2005                     'urlhandle': urlh
2006                 }
2007         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2008             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2009         if info is None: # Regular URL
2010             try:
2011                 json_code_bytes = urlh.read()
2012                 json_code = json_code_bytes.decode('utf-8')
2013             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2014                 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2015
2016             try:
2017                 json_data = json.loads(json_code)
2018                 if 'Post' in json_data:
2019                     data = json_data['Post']
2020                 else:
2021                     data = json_data
2022
2023                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2024                 video_url = data['media']['url']
2025                 umobj = re.match(self._URL_EXT, video_url)
2026                 if umobj is None:
2027                     raise ValueError('Can not determine filename extension')
2028                 ext = umobj.group(1)
2029
2030                 info = {
2031                     'id': data['item_id'],
2032                     'url': video_url,
2033                     'uploader': data['display_name'],
2034                     'upload_date': upload_date,
2035                     'title': data['title'],
2036                     'ext': ext,
2037                     'format': data['media']['mimeType'],
2038                     'thumbnail': data['thumbnailUrl'],
2039                     'description': data['description'],
2040                     'player_url': data['embedUrl'],
2041                     'user_agent': 'iTunes/10.6.1',
2042                 }
2043             except (ValueError,KeyError) as err:
2044                 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
2045
2046         return [info]
2047
2048
2049 class MyVideoIE(InfoExtractor):
2050     """Information Extractor for myvideo.de."""
2051
2052     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2053     IE_NAME = u'myvideo'
2054
2055     # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2056     # Released into the Public Domain by Tristan Fischer on 2013-05-19
2057     # https://github.com/rg3/youtube-dl/pull/842
2058     def __rc4crypt(self,data, key):
2059         x = 0
2060         box = list(range(256))
2061         for i in list(range(256)):
2062             x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2063             box[i], box[x] = box[x], box[i]
2064         x = 0
2065         y = 0
2066         out = ''
2067         for char in data:
2068             x = (x + 1) % 256
2069             y = (y + box[x]) % 256
2070             box[x], box[y] = box[y], box[x]
2071             out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
2072         return out
2073
2074     def __md5(self,s):
2075         return hashlib.md5(s).hexdigest().encode()
2076
2077     def _real_extract(self,url):
2078         mobj = re.match(self._VALID_URL, url)
2079         if mobj is None:
2080             raise ExtractorError(u'invalid URL: %s' % url)
2081
2082         video_id = mobj.group(1)
2083
2084         GK = (
2085           b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2086           b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2087           b'TnpsbA0KTVRkbU1tSTRNdz09'
2088         )
2089
2090         # Get video webpage
2091         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2092         webpage = self._download_webpage(webpage_url, video_id)
2093
2094         mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2095         if mobj is not None:
2096             self.report_extraction(video_id)
2097             video_url = mobj.group(1) + '.flv'
2098
2099             video_title = self._html_search_regex('<title>([^<]+)</title>',
2100                 webpage, u'title')
2101
2102             video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2103
2104             return [{
2105                 'id':       video_id,
2106                 'url':      video_url,
2107                 'uploader': None,
2108                 'upload_date':  None,
2109                 'title':    video_title,
2110                 'ext':      u'flv',
2111             }]
2112
2113         # try encxml
2114         mobj = re.search('var flashvars={(.+?)}', webpage)
2115         if mobj is None:
2116             raise ExtractorError(u'Unable to extract video')
2117
2118         params = {}
2119         encxml = ''
2120         sec = mobj.group(1)
2121         for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2122             if not a == '_encxml':
2123                 params[a] = b
2124             else:
2125                 encxml = compat_urllib_parse.unquote(b)
2126         if not params.get('domain'):
2127             params['domain'] = 'www.myvideo.de'
2128         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2129         if 'flash_playertype=MTV' in xmldata_url:
2130             self._downloader.report_warning(u'avoiding MTV player')
2131             xmldata_url = (
2132                 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2133                 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
2134             ) % video_id
2135
2136         # get enc data
2137         enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2138         enc_data_b = binascii.unhexlify(enc_data)
2139         sk = self.__md5(
2140             base64.b64decode(base64.b64decode(GK)) +
2141             self.__md5(
2142                 str(video_id).encode('utf-8')
2143             )
2144         )
2145         dec_data = self.__rc4crypt(enc_data_b, sk)
2146
2147         # extracting infos
2148         self.report_extraction(video_id)
2149
2150         video_url = None
2151         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2152         if mobj:
2153             video_url = compat_urllib_parse.unquote(mobj.group(1))
2154             if 'myvideo2flash' in video_url:
2155                 self._downloader.report_warning(u'forcing RTMPT ...')
2156                 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2157
2158         if not video_url:
2159             # extract non rtmp videos
2160             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2161             if mobj is None:
2162                 raise ExtractorError(u'unable to extract url')
2163             video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2164
2165         video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2166         video_file = compat_urllib_parse.unquote(video_file)
2167
2168         if not video_file.endswith('f4m'):
2169             ppath, prefix = video_file.split('.')
2170             video_playpath = '%s:%s' % (prefix, ppath)
2171             video_hls_playlist = ''
2172         else:
2173             video_playpath = ''
2174             video_hls_playlist = (
2175                 video_filepath + video_file
2176             ).replace('.f4m', '.m3u8')
2177
2178         video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2179         video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2180
2181         video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2182             webpage, u'title')
2183
2184         return [{
2185             'id':                 video_id,
2186             'url':                video_url,
2187             'tc_url':             video_url,
2188             'uploader':           None,
2189             'upload_date':        None,
2190             'title':              video_title,
2191             'ext':                u'flv',
2192             'play_path':          video_playpath,
2193             'video_file':         video_file,
2194             'video_hls_playlist': video_hls_playlist,
2195             'player_url':         video_swfobj,
2196         }]
2197
2198
2199 class ComedyCentralIE(InfoExtractor):
2200     """Information extractor for The Daily Show and Colbert Report """
2201
2202     # urls can be abbreviations like :thedailyshow or :colbert
2203     # urls for episodes like:
2204     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2205     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2206     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2207     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2208                       |(https?://)?(www\.)?
2209                           (?P<showname>thedailyshow|colbertnation)\.com/
2210                          (full-episodes/(?P<episode>.*)|
2211                           (?P<clip>
2212                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2213                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2214                      $"""
2215
2216     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2217
2218     _video_extensions = {
2219         '3500': 'mp4',
2220         '2200': 'mp4',
2221         '1700': 'mp4',
2222         '1200': 'mp4',
2223         '750': 'mp4',
2224         '400': 'mp4',
2225     }
2226     _video_dimensions = {
2227         '3500': '1280x720',
2228         '2200': '960x540',
2229         '1700': '768x432',
2230         '1200': '640x360',
2231         '750': '512x288',
2232         '400': '384x216',
2233     }
2234
2235     @classmethod
2236     def suitable(cls, url):
2237         """Receives a URL and returns True if suitable for this IE."""
2238         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2239
2240     def _print_formats(self, formats):
2241         print('Available formats:')
2242         for x in formats:
2243             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2244
2245
2246     def _real_extract(self, url):
2247         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2248         if mobj is None:
2249             raise ExtractorError(u'Invalid URL: %s' % url)
2250
2251         if mobj.group('shortname'):
2252             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2253                 url = u'http://www.thedailyshow.com/full-episodes/'
2254             else:
2255                 url = u'http://www.colbertnation.com/full-episodes/'
2256             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2257             assert mobj is not None
2258
2259         if mobj.group('clip'):
2260             if mobj.group('showname') == 'thedailyshow':
2261                 epTitle = mobj.group('tdstitle')
2262             else:
2263                 epTitle = mobj.group('cntitle')
2264             dlNewest = False
2265         else:
2266             dlNewest = not mobj.group('episode')
2267             if dlNewest:
2268                 epTitle = mobj.group('showname')
2269             else:
2270                 epTitle = mobj.group('episode')
2271
2272         self.report_extraction(epTitle)
2273         webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2274         if dlNewest:
2275             url = htmlHandle.geturl()
2276             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2277             if mobj is None:
2278                 raise ExtractorError(u'Invalid redirected URL: ' + url)
2279             if mobj.group('episode') == '':
2280                 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2281             epTitle = mobj.group('episode')
2282
2283         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2284
2285         if len(mMovieParams) == 0:
2286             # The Colbert Report embeds the information in a without
2287             # a URL prefix; so extract the alternate reference
2288             # and then add the URL prefix manually.
2289
2290             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2291             if len(altMovieParams) == 0:
2292                 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2293             else:
2294                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2295
2296         uri = mMovieParams[0][1]
2297         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2298         indexXml = self._download_webpage(indexUrl, epTitle,
2299                                           u'Downloading show index',
2300                                           u'unable to download episode index')
2301
2302         results = []
2303
2304         idoc = xml.etree.ElementTree.fromstring(indexXml)
2305         itemEls = idoc.findall('.//item')
2306         for partNum,itemEl in enumerate(itemEls):
2307             mediaId = itemEl.findall('./guid')[0].text
2308             shortMediaId = mediaId.split(':')[-1]
2309             showId = mediaId.split(':')[-2].replace('.com', '')
2310             officialTitle = itemEl.findall('./title')[0].text
2311             officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2312
2313             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2314                         compat_urllib_parse.urlencode({'uri': mediaId}))
2315             configXml = self._download_webpage(configUrl, epTitle,
2316                                                u'Downloading configuration for %s' % shortMediaId)
2317
2318             cdoc = xml.etree.ElementTree.fromstring(configXml)
2319             turls = []
2320             for rendition in cdoc.findall('.//rendition'):
2321                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2322                 turls.append(finfo)
2323
2324             if len(turls) == 0:
2325                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2326                 continue
2327
2328             if self._downloader.params.get('listformats', None):
2329                 self._print_formats([i[0] for i in turls])
2330                 return
2331
2332             # For now, just pick the highest bitrate
2333             format,rtmp_video_url = turls[-1]
2334
2335             # Get the format arg from the arg stream
2336             req_format = self._downloader.params.get('format', None)
2337
2338             # Select format if we can find one
2339             for f,v in turls:
2340                 if f == req_format:
2341                     format, rtmp_video_url = f, v
2342                     break
2343
2344             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2345             if not m:
2346                 raise ExtractorError(u'Cannot transform RTMP url')
2347             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2348             video_url = base + m.group('finalid')
2349
2350             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2351             info = {
2352                 'id': shortMediaId,
2353                 'url': video_url,
2354                 'uploader': showId,
2355                 'upload_date': officialDate,
2356                 'title': effTitle,
2357                 'ext': 'mp4',
2358                 'format': format,
2359                 'thumbnail': None,
2360                 'description': officialTitle,
2361             }
2362             results.append(info)
2363
2364         return results
2365
2366
2367 class EscapistIE(InfoExtractor):
2368     """Information extractor for The Escapist """
2369
2370     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2371     IE_NAME = u'escapist'
2372
2373     def _real_extract(self, url):
2374         mobj = re.match(self._VALID_URL, url)
2375         if mobj is None:
2376             raise ExtractorError(u'Invalid URL: %s' % url)
2377         showName = mobj.group('showname')
2378         videoId = mobj.group('episode')
2379
2380         self.report_extraction(videoId)
2381         webpage = self._download_webpage(url, videoId)
2382
2383         videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2384             webpage, u'description', fatal=False)
2385
2386         imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2387             webpage, u'thumbnail', fatal=False)
2388
2389         playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2390             webpage, u'player url')
2391
2392         title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2393             webpage, u'player url').split(' : ')[-1]
2394
2395         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2396         configUrl = compat_urllib_parse.unquote(configUrl)
2397
2398         configJSON = self._download_webpage(configUrl, videoId,
2399                                             u'Downloading configuration',
2400                                             u'unable to download configuration')
2401
2402         # Technically, it's JavaScript, not JSON
2403         configJSON = configJSON.replace("'", '"')
2404
2405         try:
2406             config = json.loads(configJSON)
2407         except (ValueError,) as err:
2408             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2409
2410         playlist = config['playlist']
2411         videoUrl = playlist[1]['url']
2412
2413         info = {
2414             'id': videoId,
2415             'url': videoUrl,
2416             'uploader': showName,
2417             'upload_date': None,
2418             'title': title,
2419             'ext': 'mp4',
2420             'thumbnail': imgUrl,
2421             'description': videoDesc,
2422             'player_url': playerUrl,
2423         }
2424
2425         return [info]
2426
2427 class CollegeHumorIE(InfoExtractor):
2428     """Information extractor for collegehumor.com"""
2429
2430     _WORKING = False
2431     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2432     IE_NAME = u'collegehumor'
2433
2434     def report_manifest(self, video_id):
2435         """Report information extraction."""
2436         self.to_screen(u'%s: Downloading XML manifest' % video_id)
2437
2438     def _real_extract(self, url):
2439         mobj = re.match(self._VALID_URL, url)
2440         if mobj is None:
2441             raise ExtractorError(u'Invalid URL: %s' % url)
2442         video_id = mobj.group('videoid')
2443
2444         info = {
2445             'id': video_id,
2446             'uploader': None,
2447             'upload_date': None,
2448         }
2449
2450         self.report_extraction(video_id)
2451         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2452         try:
2453             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2454         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2455             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2456
2457         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2458         try:
2459             videoNode = mdoc.findall('./video')[0]
2460             info['description'] = videoNode.findall('./description')[0].text
2461             info['title'] = videoNode.findall('./caption')[0].text
2462             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2463             manifest_url = videoNode.findall('./file')[0].text
2464         except IndexError:
2465             raise ExtractorError(u'Invalid metadata XML file')
2466
2467         manifest_url += '?hdcore=2.10.3'
2468         self.report_manifest(video_id)
2469         try:
2470             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2471         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2472             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2473
2474         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2475         try:
2476             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2477             node_id = media_node.attrib['url']
2478             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2479         except IndexError as err:
2480             raise ExtractorError(u'Invalid manifest file')
2481
2482         url_pr = compat_urllib_parse_urlparse(manifest_url)
2483         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2484
2485         info['url'] = url
2486         info['ext'] = 'f4f'
2487         return [info]
2488
2489
2490 class XVideosIE(InfoExtractor):
2491     """Information extractor for xvideos.com"""
2492
2493     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2494     IE_NAME = u'xvideos'
2495
2496     def _real_extract(self, url):
2497         mobj = re.match(self._VALID_URL, url)
2498         if mobj is None:
2499             raise ExtractorError(u'Invalid URL: %s' % url)
2500         video_id = mobj.group(1)
2501
2502         webpage = self._download_webpage(url, video_id)
2503
2504         self.report_extraction(video_id)
2505
2506         # Extract video URL
2507         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2508             webpage, u'video URL'))
2509
2510         # Extract title
2511         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2512             webpage, u'title')
2513
2514         # Extract video thumbnail
2515         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2516             webpage, u'thumbnail', fatal=False)
2517
2518         info = {
2519             'id': video_id,
2520             'url': video_url,
2521             'uploader': None,
2522             'upload_date': None,
2523             'title': video_title,
2524             'ext': 'flv',
2525             'thumbnail': video_thumbnail,
2526             'description': None,
2527         }
2528
2529         return [info]
2530
2531
2532 class SoundcloudIE(InfoExtractor):
2533     """Information extractor for soundcloud.com
2534        To access the media, the uid of the song and a stream token
2535        must be extracted from the page source and the script must make
2536        a request to media.soundcloud.com/crossdomain.xml. Then
2537        the media can be grabbed by requesting from an url composed
2538        of the stream token and uid
2539      """
2540
2541     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2542     IE_NAME = u'soundcloud'
2543
2544     def report_resolve(self, video_id):
2545         """Report information extraction."""
2546         self.to_screen(u'%s: Resolving id' % video_id)
2547
2548     def _real_extract(self, url):
2549         mobj = re.match(self._VALID_URL, url)
2550         if mobj is None:
2551             raise ExtractorError(u'Invalid URL: %s' % url)
2552
2553         # extract uploader (which is in the url)
2554         uploader = mobj.group(1)
2555         # extract simple title (uploader + slug of song title)
2556         slug_title =  mobj.group(2)
2557         simple_title = uploader + u'-' + slug_title
2558         full_title = '%s/%s' % (uploader, slug_title)
2559
2560         self.report_resolve(full_title)
2561
2562         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2563         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2564         info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2565
2566         info = json.loads(info_json)
2567         video_id = info['id']
2568         self.report_extraction(full_title)
2569
2570         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2571         stream_json = self._download_webpage(streams_url, full_title,
2572                                              u'Downloading stream definitions',
2573                                              u'unable to download stream definitions')
2574
2575         streams = json.loads(stream_json)
2576         mediaURL = streams['http_mp3_128_url']
2577         upload_date = unified_strdate(info['created_at'])
2578
2579         return [{
2580             'id':       info['id'],
2581             'url':      mediaURL,
2582             'uploader': info['user']['username'],
2583             'upload_date': upload_date,
2584             'title':    info['title'],
2585             'ext':      u'mp3',
2586             'description': info['description'],
2587         }]
2588
2589 class SoundcloudSetIE(InfoExtractor):
2590     """Information extractor for soundcloud.com sets
2591        To access the media, the uid of the song and a stream token
2592        must be extracted from the page source and the script must make
2593        a request to media.soundcloud.com/crossdomain.xml. Then
2594        the media can be grabbed by requesting from an url composed
2595        of the stream token and uid
2596      """
2597
2598     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2599     IE_NAME = u'soundcloud:set'
2600
2601     def report_resolve(self, video_id):
2602         """Report information extraction."""
2603         self.to_screen(u'%s: Resolving id' % video_id)
2604
2605     def _real_extract(self, url):
2606         mobj = re.match(self._VALID_URL, url)
2607         if mobj is None:
2608             raise ExtractorError(u'Invalid URL: %s' % url)
2609
2610         # extract uploader (which is in the url)
2611         uploader = mobj.group(1)
2612         # extract simple title (uploader + slug of song title)
2613         slug_title =  mobj.group(2)
2614         simple_title = uploader + u'-' + slug_title
2615         full_title = '%s/sets/%s' % (uploader, slug_title)
2616
2617         self.report_resolve(full_title)
2618
2619         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2620         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2621         info_json = self._download_webpage(resolv_url, full_title)
2622
2623         videos = []
2624         info = json.loads(info_json)
2625         if 'errors' in info:
2626             for err in info['errors']:
2627                 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2628             return
2629
2630         self.report_extraction(full_title)
2631         for track in info['tracks']:
2632             video_id = track['id']
2633
2634             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2635             stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2636
2637             self.report_extraction(video_id)
2638             streams = json.loads(stream_json)
2639             mediaURL = streams['http_mp3_128_url']
2640
2641             videos.append({
2642                 'id':       video_id,
2643                 'url':      mediaURL,
2644                 'uploader': track['user']['username'],
2645                 'upload_date':  unified_strdate(track['created_at']),
2646                 'title':    track['title'],
2647                 'ext':      u'mp3',
2648                 'description': track['description'],
2649             })
2650         return videos
2651
2652
2653 class InfoQIE(InfoExtractor):
2654     """Information extractor for infoq.com"""
2655     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2656
2657     def _real_extract(self, url):
2658         mobj = re.match(self._VALID_URL, url)
2659         if mobj is None:
2660             raise ExtractorError(u'Invalid URL: %s' % url)
2661
2662         webpage = self._download_webpage(url, video_id=url)
2663         self.report_extraction(url)
2664
2665         # Extract video URL
2666         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2667         if mobj is None:
2668             raise ExtractorError(u'Unable to extract video url')
2669         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2670         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2671
2672         # Extract title
2673         video_title = self._search_regex(r'contentTitle = "(.*?)";',
2674             webpage, u'title')
2675
2676         # Extract description
2677         video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2678             webpage, u'description', fatal=False)
2679
2680         video_filename = video_url.split('/')[-1]
2681         video_id, extension = video_filename.split('.')
2682
2683         info = {
2684             'id': video_id,
2685             'url': video_url,
2686             'uploader': None,
2687             'upload_date': None,
2688             'title': video_title,
2689             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2690             'thumbnail': None,
2691             'description': video_description,
2692         }
2693
2694         return [info]
2695
2696 class MixcloudIE(InfoExtractor):
2697     """Information extractor for www.mixcloud.com"""
2698
2699     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2700     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2701     IE_NAME = u'mixcloud'
2702
2703     def report_download_json(self, file_id):
2704         """Report JSON download."""
2705         self.to_screen(u'Downloading json')
2706
2707     def get_urls(self, jsonData, fmt, bitrate='best'):
2708         """Get urls from 'audio_formats' section in json"""
2709         file_url = None
2710         try:
2711             bitrate_list = jsonData[fmt]
2712             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2713                 bitrate = max(bitrate_list) # select highest
2714
2715             url_list = jsonData[fmt][bitrate]
2716         except TypeError: # we have no bitrate info.
2717             url_list = jsonData[fmt]
2718         return url_list
2719
2720     def check_urls(self, url_list):
2721         """Returns 1st active url from list"""
2722         for url in url_list:
2723             try:
2724                 compat_urllib_request.urlopen(url)
2725                 return url
2726             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2727                 url = None
2728
2729         return None
2730
2731     def _print_formats(self, formats):
2732         print('Available formats:')
2733         for fmt in formats.keys():
2734             for b in formats[fmt]:
2735                 try:
2736                     ext = formats[fmt][b][0]
2737                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2738                 except TypeError: # we have no bitrate info
2739                     ext = formats[fmt][0]
2740                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2741                     break
2742
2743     def _real_extract(self, url):
2744         mobj = re.match(self._VALID_URL, url)
2745         if mobj is None:
2746             raise ExtractorError(u'Invalid URL: %s' % url)
2747         # extract uploader & filename from url
2748         uploader = mobj.group(1).decode('utf-8')
2749         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2750
2751         # construct API request
2752         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2753         # retrieve .json file with links to files
2754         request = compat_urllib_request.Request(file_url)
2755         try:
2756             self.report_download_json(file_url)
2757             jsonData = compat_urllib_request.urlopen(request).read()
2758         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2759             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2760
2761         # parse JSON
2762         json_data = json.loads(jsonData)
2763         player_url = json_data['player_swf_url']
2764         formats = dict(json_data['audio_formats'])
2765
2766         req_format = self._downloader.params.get('format', None)
2767         bitrate = None
2768
2769         if self._downloader.params.get('listformats', None):
2770             self._print_formats(formats)
2771             return
2772
2773         if req_format is None or req_format == 'best':
2774             for format_param in formats.keys():
2775                 url_list = self.get_urls(formats, format_param)
2776                 # check urls
2777                 file_url = self.check_urls(url_list)
2778                 if file_url is not None:
2779                     break # got it!
2780         else:
2781             if req_format not in formats:
2782                 raise ExtractorError(u'Format is not available')
2783
2784             url_list = self.get_urls(formats, req_format)
2785             file_url = self.check_urls(url_list)
2786             format_param = req_format
2787
2788         return [{
2789             'id': file_id.decode('utf-8'),
2790             'url': file_url.decode('utf-8'),
2791             'uploader': uploader.decode('utf-8'),
2792             'upload_date': None,
2793             'title': json_data['name'],
2794             'ext': file_url.split('.')[-1].decode('utf-8'),
2795             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2796             'thumbnail': json_data['thumbnail_url'],
2797             'description': json_data['description'],
2798             'player_url': player_url.decode('utf-8'),
2799         }]
2800
2801 class StanfordOpenClassroomIE(InfoExtractor):
2802     """Information extractor for Stanford's Open ClassRoom"""
2803
2804     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2805     IE_NAME = u'stanfordoc'
2806
2807     def _real_extract(self, url):
2808         mobj = re.match(self._VALID_URL, url)
2809         if mobj is None:
2810             raise ExtractorError(u'Invalid URL: %s' % url)
2811
2812         if mobj.group('course') and mobj.group('video'): # A specific video
2813             course = mobj.group('course')
2814             video = mobj.group('video')
2815             info = {
2816                 'id': course + '_' + video,
2817                 'uploader': None,
2818                 'upload_date': None,
2819             }
2820
2821             self.report_extraction(info['id'])
2822             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2823             xmlUrl = baseUrl + video + '.xml'
2824             try:
2825                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2826             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2827                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2828             mdoc = xml.etree.ElementTree.fromstring(metaXml)
2829             try:
2830                 info['title'] = mdoc.findall('./title')[0].text
2831                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2832             except IndexError:
2833                 raise ExtractorError(u'Invalid metadata XML file')
2834             info['ext'] = info['url'].rpartition('.')[2]
2835             return [info]
2836         elif mobj.group('course'): # A course page
2837             course = mobj.group('course')
2838             info = {
2839                 'id': course,
2840                 'type': 'playlist',
2841                 'uploader': None,
2842                 'upload_date': None,
2843             }
2844
2845             coursepage = self._download_webpage(url, info['id'],
2846                                         note='Downloading course info page',
2847                                         errnote='Unable to download course info page')
2848
2849             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2850
2851             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2852                 coursepage, u'description', fatal=False)
2853
2854             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2855             info['list'] = [
2856                 {
2857                     'type': 'reference',
2858                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2859                 }
2860                     for vpage in links]
2861             results = []
2862             for entry in info['list']:
2863                 assert entry['type'] == 'reference'
2864                 results += self.extract(entry['url'])
2865             return results
2866         else: # Root page
2867             info = {
2868                 'id': 'Stanford OpenClassroom',
2869                 'type': 'playlist',
2870                 'uploader': None,
2871                 'upload_date': None,
2872             }
2873
2874             self.report_download_webpage(info['id'])
2875             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2876             try:
2877                 rootpage = compat_urllib_request.urlopen(rootURL).read()
2878             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2879                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2880
2881             info['title'] = info['id']
2882
2883             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2884             info['list'] = [
2885                 {
2886                     'type': 'reference',
2887                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2888                 }
2889                     for cpage in links]
2890
2891             results = []
2892             for entry in info['list']:
2893                 assert entry['type'] == 'reference'
2894                 results += self.extract(entry['url'])
2895             return results
2896
2897 class MTVIE(InfoExtractor):
2898     """Information extractor for MTV.com"""
2899
2900     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2901     IE_NAME = u'mtv'
2902
2903     def _real_extract(self, url):
2904         mobj = re.match(self._VALID_URL, url)
2905         if mobj is None:
2906             raise ExtractorError(u'Invalid URL: %s' % url)
2907         if not mobj.group('proto'):
2908             url = 'http://' + url
2909         video_id = mobj.group('videoid')
2910
2911         webpage = self._download_webpage(url, video_id)
2912
2913         song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2914             webpage, u'song name', fatal=False)
2915
2916         video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2917             webpage, u'title')
2918
2919         mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2920             webpage, u'mtvn_uri', fatal=False)
2921
2922         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2923             webpage, u'content id', fatal=False)
2924
2925         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2926         self.report_extraction(video_id)
2927         request = compat_urllib_request.Request(videogen_url)
2928         try:
2929             metadataXml = compat_urllib_request.urlopen(request).read()
2930         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2931             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2932
2933         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2934         renditions = mdoc.findall('.//rendition')
2935
2936         # For now, always pick the highest quality.
2937         rendition = renditions[-1]
2938
2939         try:
2940             _,_,ext = rendition.attrib['type'].partition('/')
2941             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2942             video_url = rendition.find('./src').text
2943         except KeyError:
2944             raise ExtractorError('Invalid rendition field.')
2945
2946         info = {
2947             'id': video_id,
2948             'url': video_url,
2949             'uploader': performer,
2950             'upload_date': None,
2951             'title': video_title,
2952             'ext': ext,
2953             'format': format,
2954         }
2955
2956         return [info]
2957
2958
2959 class YoukuIE(InfoExtractor):
2960     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2961
2962     def _gen_sid(self):
2963         nowTime = int(time.time() * 1000)
2964         random1 = random.randint(1000,1998)
2965         random2 = random.randint(1000,9999)
2966
2967         return "%d%d%d" %(nowTime,random1,random2)
2968
2969     def _get_file_ID_mix_string(self, seed):
2970         mixed = []
2971         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2972         seed = float(seed)
2973         for i in range(len(source)):
2974             seed  =  (seed * 211 + 30031 ) % 65536
2975             index  =  math.floor(seed / 65536 * len(source) )
2976             mixed.append(source[int(index)])
2977             source.remove(source[int(index)])
2978         #return ''.join(mixed)
2979         return mixed
2980
2981     def _get_file_id(self, fileId, seed):
2982         mixed = self._get_file_ID_mix_string(seed)
2983         ids = fileId.split('*')
2984         realId = []
2985         for ch in ids:
2986             if ch:
2987                 realId.append(mixed[int(ch)])
2988         return ''.join(realId)
2989
2990     def _real_extract(self, url):
2991         mobj = re.match(self._VALID_URL, url)
2992         if mobj is None:
2993             raise ExtractorError(u'Invalid URL: %s' % url)
2994         video_id = mobj.group('ID')
2995
2996         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2997
2998         jsondata = self._download_webpage(info_url, video_id)
2999
3000         self.report_extraction(video_id)
3001         try:
3002             config = json.loads(jsondata)
3003
3004             video_title =  config['data'][0]['title']
3005             seed = config['data'][0]['seed']
3006
3007             format = self._downloader.params.get('format', None)
3008             supported_format = list(config['data'][0]['streamfileids'].keys())
3009
3010             if format is None or format == 'best':
3011                 if 'hd2' in supported_format:
3012                     format = 'hd2'
3013                 else:
3014                     format = 'flv'
3015                 ext = u'flv'
3016             elif format == 'worst':
3017                 format = 'mp4'
3018                 ext = u'mp4'
3019             else:
3020                 format = 'flv'
3021                 ext = u'flv'
3022
3023
3024             fileid = config['data'][0]['streamfileids'][format]
3025             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3026         except (UnicodeDecodeError, ValueError, KeyError):
3027             raise ExtractorError(u'Unable to extract info section')
3028
3029         files_info=[]
3030         sid = self._gen_sid()
3031         fileid = self._get_file_id(fileid, seed)
3032
3033         #column 8,9 of fileid represent the segment number
3034         #fileid[7:9] should be changed
3035         for index, key in enumerate(keys):
3036
3037             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3038             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3039
3040             info = {
3041                 'id': '%s_part%02d' % (video_id, index),
3042                 'url': download_url,
3043                 'uploader': None,
3044                 'upload_date': None,
3045                 'title': video_title,
3046                 'ext': ext,
3047             }
3048             files_info.append(info)
3049
3050         return files_info
3051
3052
3053 class XNXXIE(InfoExtractor):
3054     """Information extractor for xnxx.com"""
3055
3056     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3057     IE_NAME = u'xnxx'
3058     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3059     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3060     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3061
3062     def _real_extract(self, url):
3063         mobj = re.match(self._VALID_URL, url)
3064         if mobj is None:
3065             raise ExtractorError(u'Invalid URL: %s' % url)
3066         video_id = mobj.group(1)
3067
3068         # Get webpage content
3069         webpage = self._download_webpage(url, video_id)
3070
3071         video_url = self._search_regex(self.VIDEO_URL_RE,
3072             webpage, u'video URL')
3073         video_url = compat_urllib_parse.unquote(video_url)
3074
3075         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
3076             webpage, u'title')
3077
3078         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3079             webpage, u'thumbnail', fatal=False)
3080
3081         return [{
3082             'id': video_id,
3083             'url': video_url,
3084             'uploader': None,
3085             'upload_date': None,
3086             'title': video_title,
3087             'ext': 'flv',
3088             'thumbnail': video_thumbnail,
3089             'description': None,
3090         }]
3091
3092
3093 class GooglePlusIE(InfoExtractor):
3094     """Information extractor for plus.google.com."""
3095
3096     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3097     IE_NAME = u'plus.google'
3098
3099     def _real_extract(self, url):
3100         # Extract id from URL
3101         mobj = re.match(self._VALID_URL, url)
3102         if mobj is None:
3103             raise ExtractorError(u'Invalid URL: %s' % url)
3104
3105         post_url = mobj.group(0)
3106         video_id = mobj.group(1)
3107
3108         video_extension = 'flv'
3109
3110         # Step 1, Retrieve post webpage to extract further information
3111         webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3112
3113         self.report_extraction(video_id)
3114
3115         # Extract update date
3116         upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3117             webpage, u'upload date', fatal=False)
3118         if upload_date:
3119             # Convert timestring to a format suitable for filename
3120             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3121             upload_date = upload_date.strftime('%Y%m%d')
3122
3123         # Extract uploader
3124         uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3125             webpage, u'uploader', fatal=False)
3126
3127         # Extract title
3128         # Get the first line for title
3129         video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3130             webpage, 'title', default=u'NA')
3131
3132         # Step 2, Stimulate clicking the image box to launch video
3133         video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3134             webpage, u'video page URL')
3135         webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3136
3137         # Extract video links on video page
3138         """Extract video links of all sizes"""
3139         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3140         mobj = re.findall(pattern, webpage)
3141         if len(mobj) == 0:
3142             raise ExtractorError(u'Unable to extract video links')
3143
3144         # Sort in resolution
3145         links = sorted(mobj)
3146
3147         # Choose the lowest of the sort, i.e. highest resolution
3148         video_url = links[-1]
3149         # Only get the url. The resolution part in the tuple has no use anymore
3150         video_url = video_url[-1]
3151         # Treat escaped \u0026 style hex
3152         try:
3153             video_url = video_url.decode("unicode_escape")
3154         except AttributeError: # Python 3
3155             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3156
3157
3158         return [{
3159             'id':       video_id,
3160             'url':      video_url,
3161             'uploader': uploader,
3162             'upload_date':  upload_date,
3163             'title':    video_title,
3164             'ext':      video_extension,
3165         }]
3166
3167 class NBAIE(InfoExtractor):
3168     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3169     IE_NAME = u'nba'
3170
3171     def _real_extract(self, url):
3172         mobj = re.match(self._VALID_URL, url)
3173         if mobj is None:
3174             raise ExtractorError(u'Invalid URL: %s' % url)
3175
3176         video_id = mobj.group(1)
3177
3178         webpage = self._download_webpage(url, video_id)
3179
3180         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3181
3182         shortened_video_id = video_id.rpartition('/')[2]
3183         title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3184             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3185
3186         # It isn't there in the HTML it returns to us
3187         # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3188
3189         description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3190
3191         info = {
3192             'id': shortened_video_id,
3193             'url': video_url,
3194             'ext': 'mp4',
3195             'title': title,
3196             # 'uploader_date': uploader_date,
3197             'description': description,
3198         }
3199         return [info]
3200
3201 class JustinTVIE(InfoExtractor):
3202     """Information extractor for justin.tv and twitch.tv"""
3203     # TODO: One broadcast may be split into multiple videos. The key
3204     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3205     # starts at 1 and increases. Can we treat all parts as one video?
3206
3207     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3208         (?:
3209             (?P<channelid>[^/]+)|
3210             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3211             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3212         )
3213         /?(?:\#.*)?$
3214         """
3215     _JUSTIN_PAGE_LIMIT = 100
3216     IE_NAME = u'justin.tv'
3217
3218     def report_download_page(self, channel, offset):
3219         """Report attempt to download a single page of videos."""
3220         self.to_screen(u'%s: Downloading video information from %d to %d' %
3221                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3222
3223     # Return count of items, list of *valid* items
3224     def _parse_page(self, url, video_id):
3225         webpage = self._download_webpage(url, video_id,
3226                                          u'Downloading video info JSON',
3227                                          u'unable to download video info JSON')
3228
3229         response = json.loads(webpage)
3230         if type(response) != list:
3231             error_text = response.get('error', 'unknown error')
3232             raise ExtractorError(u'Justin.tv API: %s' % error_text)
3233         info = []
3234         for clip in response:
3235             video_url = clip['video_file_url']
3236             if video_url:
3237                 video_extension = os.path.splitext(video_url)[1][1:]
3238                 video_date = re.sub('-', '', clip['start_time'][:10])
3239                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3240                 video_id = clip['id']
3241                 video_title = clip.get('title', video_id)
3242                 info.append({
3243                     'id': video_id,
3244                     'url': video_url,
3245                     'title': video_title,
3246                     'uploader': clip.get('channel_name', video_uploader_id),
3247                     'uploader_id': video_uploader_id,
3248                     'upload_date': video_date,
3249                     'ext': video_extension,
3250                 })
3251         return (len(response), info)
3252
3253     def _real_extract(self, url):
3254         mobj = re.match(self._VALID_URL, url)
3255         if mobj is None:
3256             raise ExtractorError(u'invalid URL: %s' % url)
3257
3258         api_base = 'http://api.justin.tv'
3259         paged = False
3260         if mobj.group('channelid'):
3261             paged = True
3262             video_id = mobj.group('channelid')
3263             api = api_base + '/channel/archives/%s.json' % video_id
3264         elif mobj.group('chapterid'):
3265             chapter_id = mobj.group('chapterid')
3266
3267             webpage = self._download_webpage(url, chapter_id)
3268             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3269             if not m:
3270                 raise ExtractorError(u'Cannot find archive of a chapter')
3271             archive_id = m.group(1)
3272
3273             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3274             chapter_info_xml = self._download_webpage(api, chapter_id,
3275                                              note=u'Downloading chapter information',
3276                                              errnote=u'Chapter information download failed')
3277             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3278             for a in doc.findall('.//archive'):
3279                 if archive_id == a.find('./id').text:
3280                     break
3281             else:
3282                 raise ExtractorError(u'Could not find chapter in chapter information')
3283
3284             video_url = a.find('./video_file_url').text
3285             video_ext = video_url.rpartition('.')[2] or u'flv'
3286
3287             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3288             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3289                                    note='Downloading chapter metadata',
3290                                    errnote='Download of chapter metadata failed')
3291             chapter_info = json.loads(chapter_info_json)
3292
3293             bracket_start = int(doc.find('.//bracket_start').text)
3294             bracket_end = int(doc.find('.//bracket_end').text)
3295
3296             # TODO determine start (and probably fix up file)
3297             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3298             #video_url += u'?start=' + TODO:start_timestamp
3299             # bracket_start is 13290, but we want 51670615
3300             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3301                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3302
3303             info = {
3304                 'id': u'c' + chapter_id,
3305                 'url': video_url,
3306                 'ext': video_ext,
3307                 'title': chapter_info['title'],
3308                 'thumbnail': chapter_info['preview'],
3309                 'description': chapter_info['description'],
3310                 'uploader': chapter_info['channel']['display_name'],
3311                 'uploader_id': chapter_info['channel']['name'],
3312             }
3313             return [info]
3314         else:
3315             video_id = mobj.group('videoid')
3316             api = api_base + '/broadcast/by_archive/%s.json' % video_id
3317
3318         self.report_extraction(video_id)
3319
3320         info = []
3321         offset = 0
3322         limit = self._JUSTIN_PAGE_LIMIT
3323         while True:
3324             if paged:
3325                 self.report_download_page(video_id, offset)
3326             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3327             page_count, page_info = self._parse_page(page_url, video_id)
3328             info.extend(page_info)
3329             if not paged or page_count != limit:
3330                 break
3331             offset += limit
3332         return info
3333
3334 class FunnyOrDieIE(InfoExtractor):
3335     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3336
3337     def _real_extract(self, url):
3338         mobj = re.match(self._VALID_URL, url)
3339         if mobj is None:
3340             raise ExtractorError(u'invalid URL: %s' % url)
3341
3342         video_id = mobj.group('id')
3343         webpage = self._download_webpage(url, video_id)
3344
3345         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3346             webpage, u'video URL', flags=re.DOTALL)
3347
3348         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3349             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3350
3351         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3352             webpage, u'description', fatal=False, flags=re.DOTALL)
3353
3354         info = {
3355             'id': video_id,
3356             'url': video_url,
3357             'ext': 'mp4',
3358             'title': title,
3359             'description': video_description,
3360         }
3361         return [info]
3362
3363 class SteamIE(InfoExtractor):
3364     _VALID_URL = r"""http://store\.steampowered\.com/
3365                 (agecheck/)?
3366                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3367                 (?P<gameID>\d+)/?
3368                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3369                 """
3370
3371     @classmethod
3372     def suitable(cls, url):
3373         """Receives a URL and returns True if suitable for this IE."""
3374         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3375
3376     def _real_extract(self, url):
3377         m = re.match(self._VALID_URL, url, re.VERBOSE)
3378         gameID = m.group('gameID')
3379         videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3380         self.report_age_confirmation()
3381         webpage = self._download_webpage(videourl, gameID)
3382         game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3383
3384         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3385         mweb = re.finditer(urlRE, webpage)
3386         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3387         titles = re.finditer(namesRE, webpage)
3388         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3389         thumbs = re.finditer(thumbsRE, webpage)
3390         videos = []
3391         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3392             video_id = vid.group('videoID')
3393             title = vtitle.group('videoName')
3394             video_url = vid.group('videoURL')
3395             video_thumb = thumb.group('thumbnail')
3396             if not video_url:
3397                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3398             info = {
3399                 'id':video_id,
3400                 'url':video_url,
3401                 'ext': 'flv',
3402                 'title': unescapeHTML(title),
3403                 'thumbnail': video_thumb
3404                   }
3405             videos.append(info)
3406         return [self.playlist_result(videos, gameID, game_title)]
3407
3408 class UstreamIE(InfoExtractor):
3409     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3410     IE_NAME = u'ustream'
3411
3412     def _real_extract(self, url):
3413         m = re.match(self._VALID_URL, url)
3414         video_id = m.group('videoID')
3415
3416         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3417         webpage = self._download_webpage(url, video_id)
3418
3419         self.report_extraction(video_id)
3420
3421         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3422             webpage, u'title')
3423
3424         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3425             webpage, u'uploader', fatal=False, flags=re.DOTALL)
3426
3427         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3428             webpage, u'thumbnail', fatal=False)
3429
3430         info = {
3431                 'id': video_id,
3432                 'url': video_url,
3433                 'ext': 'flv',
3434                 'title': video_title,
3435                 'uploader': uploader,
3436                 'thumbnail': thumbnail,
3437                }
3438         return info
3439
3440 class WorldStarHipHopIE(InfoExtractor):
3441     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3442     IE_NAME = u'WorldStarHipHop'
3443
3444     def _real_extract(self, url):
3445         m = re.match(self._VALID_URL, url)
3446         video_id = m.group('id')
3447
3448         webpage_src = self._download_webpage(url, video_id)
3449
3450         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3451             webpage_src, u'video URL')
3452
3453         if 'mp4' in video_url:
3454             ext = 'mp4'
3455         else:
3456             ext = 'flv'
3457
3458         video_title = self._html_search_regex(r"<title>(.*)</title>",
3459             webpage_src, u'title')
3460
3461         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3462         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3463             webpage_src, u'thumbnail', fatal=False)
3464
3465         if not thumbnail:
3466             _title = r"""candytitles.*>(.*)</span>"""
3467             mobj = re.search(_title, webpage_src)
3468             if mobj is not None:
3469                 video_title = mobj.group(1)
3470
3471         results = [{
3472                     'id': video_id,
3473                     'url' : video_url,
3474                     'title' : video_title,
3475                     'thumbnail' : thumbnail,
3476                     'ext' : ext,
3477                     }]
3478         return results
3479
3480 class RBMARadioIE(InfoExtractor):
3481     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3482
3483     def _real_extract(self, url):
3484         m = re.match(self._VALID_URL, url)
3485         video_id = m.group('videoID')
3486
3487         webpage = self._download_webpage(url, video_id)
3488
3489         json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
3490             webpage, u'json data')
3491
3492         try:
3493             data = json.loads(json_data)
3494         except ValueError as e:
3495             raise ExtractorError(u'Invalid JSON: ' + str(e))
3496
3497         video_url = data['akamai_url'] + '&cbr=256'
3498         url_parts = compat_urllib_parse_urlparse(video_url)
3499         video_ext = url_parts.path.rpartition('.')[2]
3500         info = {
3501                 'id': video_id,
3502                 'url': video_url,
3503                 'ext': video_ext,
3504                 'title': data['title'],
3505                 'description': data.get('teaser_text'),
3506                 'location': data.get('country_of_origin'),
3507                 'uploader': data.get('host', {}).get('name'),
3508                 'uploader_id': data.get('host', {}).get('slug'),
3509                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3510                 'duration': data.get('duration'),
3511         }
3512         return [info]
3513
3514
3515 class YouPornIE(InfoExtractor):
3516     """Information extractor for youporn.com."""
3517     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3518
3519     def _print_formats(self, formats):
3520         """Print all available formats"""
3521         print(u'Available formats:')
3522         print(u'ext\t\tformat')
3523         print(u'---------------------------------')
3524         for format in formats:
3525             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3526
3527     def _specific(self, req_format, formats):
3528         for x in formats:
3529             if(x["format"]==req_format):
3530                 return x
3531         return None
3532
3533     def _real_extract(self, url):
3534         mobj = re.match(self._VALID_URL, url)
3535         if mobj is None:
3536             raise ExtractorError(u'Invalid URL: %s' % url)
3537         video_id = mobj.group('videoid')
3538
3539         req = compat_urllib_request.Request(url)
3540         req.add_header('Cookie', 'age_verified=1')
3541         webpage = self._download_webpage(req, video_id)
3542
3543         # Get JSON parameters
3544         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3545         try:
3546             params = json.loads(json_params)
3547         except:
3548             raise ExtractorError(u'Invalid JSON')
3549
3550         self.report_extraction(video_id)
3551         try:
3552             video_title = params['title']
3553             upload_date = unified_strdate(params['release_date_f'])
3554             video_description = params['description']
3555             video_uploader = params['submitted_by']
3556             thumbnail = params['thumbnails'][0]['image']
3557         except KeyError:
3558             raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3559
3560         # Get all of the formats available
3561         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3562         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3563             webpage, u'download list').strip()
3564
3565         # Get all of the links from the page
3566         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3567         links = re.findall(LINK_RE, download_list_html)
3568         if(len(links) == 0):
3569             raise ExtractorError(u'ERROR: no known formats available for video')
3570
3571         self.to_screen(u'Links found: %d' % len(links))
3572
3573         formats = []
3574         for link in links:
3575
3576             # A link looks like this:
3577             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3578             # A path looks like this:
3579             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3580             video_url = unescapeHTML( link )
3581             path = compat_urllib_parse_urlparse( video_url ).path
3582             extension = os.path.splitext( path )[1][1:]
3583             format = path.split('/')[4].split('_')[:2]
3584             size = format[0]
3585             bitrate = format[1]
3586             format = "-".join( format )
3587             # title = u'%s-%s-%s' % (video_title, size, bitrate)
3588
3589             formats.append({
3590                 'id': video_id,
3591                 'url': video_url,
3592                 'uploader': video_uploader,
3593                 'upload_date': upload_date,
3594                 'title': video_title,
3595                 'ext': extension,
3596                 'format': format,
3597                 'thumbnail': thumbnail,
3598                 'description': video_description
3599             })
3600
3601         if self._downloader.params.get('listformats', None):
3602             self._print_formats(formats)
3603             return
3604
3605         req_format = self._downloader.params.get('format', None)
3606         self.to_screen(u'Format: %s' % req_format)
3607
3608         if req_format is None or req_format == 'best':
3609             return [formats[0]]
3610         elif req_format == 'worst':
3611             return [formats[-1]]
3612         elif req_format in ('-1', 'all'):
3613             return formats
3614         else:
3615             format = self._specific( req_format, formats )
3616             if result is None:
3617                 raise ExtractorError(u'Requested format not available')
3618             return [format]
3619
3620
3621
3622 class PornotubeIE(InfoExtractor):
3623     """Information extractor for pornotube.com."""
3624     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3625
3626     def _real_extract(self, url):
3627         mobj = re.match(self._VALID_URL, url)
3628         if mobj is None:
3629             raise ExtractorError(u'Invalid URL: %s' % url)
3630
3631         video_id = mobj.group('videoid')
3632         video_title = mobj.group('title')
3633
3634         # Get webpage content
3635         webpage = self._download_webpage(url, video_id)
3636
3637         # Get the video URL
3638         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3639         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3640         video_url = compat_urllib_parse.unquote(video_url)
3641
3642         #Get the uploaded date
3643         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3644         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
3645         if upload_date: upload_date = unified_strdate(upload_date)
3646
3647         info = {'id': video_id,
3648                 'url': video_url,
3649                 'uploader': None,
3650                 'upload_date': upload_date,
3651                 'title': video_title,
3652                 'ext': 'flv',
3653                 'format': 'flv'}
3654
3655         return [info]
3656
3657 class YouJizzIE(InfoExtractor):
3658     """Information extractor for youjizz.com."""
3659     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3660
3661     def _real_extract(self, url):
3662         mobj = re.match(self._VALID_URL, url)
3663         if mobj is None:
3664             raise ExtractorError(u'Invalid URL: %s' % url)
3665
3666         video_id = mobj.group('videoid')
3667
3668         # Get webpage content
3669         webpage = self._download_webpage(url, video_id)
3670
3671         # Get the video title
3672         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3673             webpage, u'title').strip()
3674
3675         # Get the embed page
3676         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3677         if result is None:
3678             raise ExtractorError(u'ERROR: unable to extract embed page')
3679
3680         embed_page_url = result.group(0).strip()
3681         video_id = result.group('videoid')
3682
3683         webpage = self._download_webpage(embed_page_url, video_id)
3684
3685         # Get the video URL
3686         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3687             webpage, u'video URL')
3688
3689         info = {'id': video_id,
3690                 'url': video_url,
3691                 'title': video_title,
3692                 'ext': 'flv',
3693                 'format': 'flv',
3694                 'player_url': embed_page_url}
3695
3696         return [info]
3697
3698 class EightTracksIE(InfoExtractor):
3699     IE_NAME = '8tracks'
3700     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3701
3702     def _real_extract(self, url):
3703         mobj = re.match(self._VALID_URL, url)
3704         if mobj is None:
3705             raise ExtractorError(u'Invalid URL: %s' % url)
3706         playlist_id = mobj.group('id')
3707
3708         webpage = self._download_webpage(url, playlist_id)
3709
3710         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3711         data = json.loads(json_like)
3712
3713         session = str(random.randint(0, 1000000000))
3714         mix_id = data['id']
3715         track_count = data['tracks_count']
3716         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3717         next_url = first_url
3718         res = []
3719         for i in itertools.count():
3720             api_json = self._download_webpage(next_url, playlist_id,
3721                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3722                 errnote=u'Failed to download song information')
3723             api_data = json.loads(api_json)
3724             track_data = api_data[u'set']['track']
3725             info = {
3726                 'id': track_data['id'],
3727                 'url': track_data['track_file_stream_url'],
3728                 'title': track_data['performer'] + u' - ' + track_data['name'],
3729                 'raw_title': track_data['name'],
3730                 'uploader_id': data['user']['login'],
3731                 'ext': 'm4a',
3732             }
3733             res.append(info)
3734             if api_data['set']['at_last_track']:
3735                 break
3736             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3737         return res
3738
3739 class KeekIE(InfoExtractor):
3740     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3741     IE_NAME = u'keek'
3742
3743     def _real_extract(self, url):
3744         m = re.match(self._VALID_URL, url)
3745         video_id = m.group('videoID')
3746
3747         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3748         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3749         webpage = self._download_webpage(url, video_id)
3750
3751         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3752             webpage, u'title')
3753
3754         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3755             webpage, u'uploader', fatal=False)
3756
3757         info = {
3758                 'id': video_id,
3759                 'url': video_url,
3760                 'ext': 'mp4',
3761                 'title': video_title,
3762                 'thumbnail': thumbnail,
3763                 'uploader': uploader
3764         }
3765         return [info]
3766
3767 class TEDIE(InfoExtractor):
3768     _VALID_URL=r'''http://www\.ted\.com/
3769                    (
3770                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3771                         |
3772                         ((?P<type_talk>talks)) # We have a simple talk
3773                    )
3774                    (/lang/(.*?))? # The url may contain the language
3775                    /(?P<name>\w+) # Here goes the name and then ".html"
3776                    '''
3777
3778     @classmethod
3779     def suitable(cls, url):
3780         """Receives a URL and returns True if suitable for this IE."""
3781         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3782
3783     def _real_extract(self, url):
3784         m=re.match(self._VALID_URL, url, re.VERBOSE)
3785         if m.group('type_talk'):
3786             return [self._talk_info(url)]
3787         else :
3788             playlist_id=m.group('playlist_id')
3789             name=m.group('name')
3790             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3791             return [self._playlist_videos_info(url,name,playlist_id)]
3792
3793     def _talk_video_link(self,mediaSlug):
3794         '''Returns the video link for that mediaSlug'''
3795         return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3796
3797     def _playlist_videos_info(self,url,name,playlist_id=0):
3798         '''Returns the videos of the playlist'''
3799         video_RE=r'''
3800                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3801                      ([.\s]*?)data-playlist_item_id="(\d+)"
3802                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3803                      '''
3804         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3805         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3806         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3807         m_names=re.finditer(video_name_RE,webpage)
3808
3809         playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3810         m_playlist = re.search(playlist_RE, webpage)
3811         playlist_title = m_playlist.group('playlist_title')
3812
3813         playlist_entries = []
3814         for m_video, m_name in zip(m_videos,m_names):
3815             video_id=m_video.group('video_id')
3816             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3817             playlist_entries.append(self.url_result(talk_url, 'TED'))
3818         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3819
3820     def _talk_info(self, url, video_id=0):
3821         """Return the video for the talk in the url"""
3822         m=re.match(self._VALID_URL, url,re.VERBOSE)
3823         videoName=m.group('name')
3824         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3825         # If the url includes the language we get the title translated
3826         title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3827         title=re.search(title_RE, webpage).group('title')
3828         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3829                         "id":(?P<videoID>[\d]+).*?
3830                         "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3831         thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3832         thumb_match=re.search(thumb_RE,webpage)
3833         info_match=re.search(info_RE,webpage,re.VERBOSE)
3834         video_id=info_match.group('videoID')
3835         mediaSlug=info_match.group('mediaSlug')
3836         video_url=self._talk_video_link(mediaSlug)
3837         info = {
3838                 'id': video_id,
3839                 'url': video_url,
3840                 'ext': 'mp4',
3841                 'title': title,
3842                 'thumbnail': thumb_match.group('thumbnail')
3843                 }
3844         return info
3845
3846 class MySpassIE(InfoExtractor):
3847     _VALID_URL = r'http://www.myspass.de/.*'
3848
3849     def _real_extract(self, url):
3850         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3851
3852         # video id is the last path element of the URL
3853         # usually there is a trailing slash, so also try the second but last
3854         url_path = compat_urllib_parse_urlparse(url).path
3855         url_parent_path, video_id = os.path.split(url_path)
3856         if not video_id:
3857             _, video_id = os.path.split(url_parent_path)
3858
3859         # get metadata
3860         metadata_url = META_DATA_URL_TEMPLATE % video_id
3861         metadata_text = self._download_webpage(metadata_url, video_id)
3862         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3863
3864         # extract values from metadata
3865         url_flv_el = metadata.find('url_flv')
3866         if url_flv_el is None:
3867             raise ExtractorError(u'Unable to extract download url')
3868         video_url = url_flv_el.text
3869         extension = os.path.splitext(video_url)[1][1:]
3870         title_el = metadata.find('title')
3871         if title_el is None:
3872             raise ExtractorError(u'Unable to extract title')
3873         title = title_el.text
3874         format_id_el = metadata.find('format_id')
3875         if format_id_el is None:
3876             format = ext
3877         else:
3878             format = format_id_el.text
3879         description_el = metadata.find('description')
3880         if description_el is not None:
3881             description = description_el.text
3882         else:
3883             description = None
3884         imagePreview_el = metadata.find('imagePreview')
3885         if imagePreview_el is not None:
3886             thumbnail = imagePreview_el.text
3887         else:
3888             thumbnail = None
3889         info = {
3890             'id': video_id,
3891             'url': video_url,
3892             'title': title,
3893             'ext': extension,
3894             'format': format,
3895             'thumbnail': thumbnail,
3896             'description': description
3897         }
3898         return [info]
3899
3900 class SpiegelIE(InfoExtractor):
3901     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3902
3903     def _real_extract(self, url):
3904         m = re.match(self._VALID_URL, url)
3905         video_id = m.group('videoID')
3906
3907         webpage = self._download_webpage(url, video_id)
3908
3909         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3910             webpage, u'title')
3911
3912         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3913         xml_code = self._download_webpage(xml_url, video_id,
3914                     note=u'Downloading XML', errnote=u'Failed to download XML')
3915
3916         idoc = xml.etree.ElementTree.fromstring(xml_code)
3917         last_type = idoc[-1]
3918         filename = last_type.findall('./filename')[0].text
3919         duration = float(last_type.findall('./duration')[0].text)
3920
3921         video_url = 'http://video2.spiegel.de/flash/' + filename
3922         video_ext = filename.rpartition('.')[2]
3923         info = {
3924             'id': video_id,
3925             'url': video_url,
3926             'ext': video_ext,
3927             'title': video_title,
3928             'duration': duration,
3929         }
3930         return [info]
3931
3932 class LiveLeakIE(InfoExtractor):
3933
3934     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3935     IE_NAME = u'liveleak'
3936
3937     def _real_extract(self, url):
3938         mobj = re.match(self._VALID_URL, url)
3939         if mobj is None:
3940             raise ExtractorError(u'Invalid URL: %s' % url)
3941
3942         video_id = mobj.group('video_id')
3943
3944         webpage = self._download_webpage(url, video_id)
3945
3946         video_url = self._search_regex(r'file: "(.*?)",',
3947             webpage, u'video URL')
3948
3949         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3950             webpage, u'title').replace('LiveLeak.com -', '').strip()
3951
3952         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3953             webpage, u'description', fatal=False)
3954
3955         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3956             webpage, u'uploader', fatal=False)
3957
3958         info = {
3959             'id':  video_id,
3960             'url': video_url,
3961             'ext': 'mp4',
3962             'title': video_title,
3963             'description': video_description,
3964             'uploader': video_uploader
3965         }
3966
3967         return [info]
3968
3969 class ARDIE(InfoExtractor):
3970     _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3971     _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3972     _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3973
3974     def _real_extract(self, url):
3975         # determine video id from url
3976         m = re.match(self._VALID_URL, url)
3977
3978         numid = re.search(r'documentId=([0-9]+)', url)
3979         if numid:
3980             video_id = numid.group(1)
3981         else:
3982             video_id = m.group('video_id')
3983
3984         # determine title and media streams from webpage
3985         html = self._download_webpage(url, video_id)
3986         title = re.search(self._TITLE, html).group('title')
3987         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
3988         if not streams:
3989             assert '"fsk"' in html
3990             raise ExtractorError(u'This video is only available after 8:00 pm')
3991
3992         # choose default media type and highest quality for now
3993         stream = max([s for s in streams if int(s["media_type"]) == 0],
3994                      key=lambda s: int(s["quality"]))
3995
3996         # there's two possibilities: RTMP stream or HTTP download
3997         info = {'id': video_id, 'title': title, 'ext': 'mp4'}
3998         if stream['rtmp_url']:
3999             self.to_screen(u'RTMP download detected')
4000             assert stream['video_url'].startswith('mp4:')
4001             info["url"] = stream["rtmp_url"]
4002             info["play_path"] = stream['video_url']
4003         else:
4004             assert stream["video_url"].endswith('.mp4')
4005             info["url"] = stream["video_url"]
4006         return [info]
4007
4008 class TumblrIE(InfoExtractor):
4009     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4010
4011     def _real_extract(self, url):
4012         m_url = re.match(self._VALID_URL, url)
4013         video_id = m_url.group('id')
4014         blog = m_url.group('blog_name')
4015
4016         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4017         webpage = self._download_webpage(url, video_id)
4018
4019         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4020         video = re.search(re_video, webpage)
4021         if video is None:
4022            raise ExtractorError(u'Unable to extract video')
4023         video_url = video.group('video_url')
4024         ext = video.group('ext')
4025
4026         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
4027             webpage, u'thumbnail', fatal=False)  # We pick the first poster
4028         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
4029
4030         # The only place where you can get a title, it's not complete,
4031         # but searching in other places doesn't work for all videos
4032         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
4033             webpage, u'title', flags=re.DOTALL)
4034
4035         return [{'id': video_id,
4036                  'url': video_url,
4037                  'title': video_title,
4038                  'thumbnail': video_thumbnail,
4039                  'ext': ext
4040                  }]
4041
4042 class BandcampIE(InfoExtractor):
4043     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4044
4045     def _real_extract(self, url):
4046         mobj = re.match(self._VALID_URL, url)
4047         title = mobj.group('title')
4048         webpage = self._download_webpage(url, title)
4049         # We get the link to the free download page
4050         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4051         if m_download is None:
4052             raise ExtractorError(u'No free songs found')
4053
4054         download_link = m_download.group(1)
4055         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4056                        webpage, re.MULTILINE|re.DOTALL).group('id')
4057
4058         download_webpage = self._download_webpage(download_link, id,
4059                                                   'Downloading free downloads page')
4060         # We get the dictionary of the track from some javascrip code
4061         info = re.search(r'items: (.*?),$',
4062                          download_webpage, re.MULTILINE).group(1)
4063         info = json.loads(info)[0]
4064         # We pick mp3-320 for now, until format selection can be easily implemented.
4065         mp3_info = info[u'downloads'][u'mp3-320']
4066         # If we try to use this url it says the link has expired
4067         initial_url = mp3_info[u'url']
4068         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4069         m_url = re.match(re_url, initial_url)
4070         #We build the url we will use to get the final track url
4071         # This url is build in Bandcamp in the script download_bunde_*.js
4072         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4073         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4074         # If we could correctly generate the .rand field the url would be
4075         #in the "download_url" key
4076         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4077
4078         track_info = {'id':id,
4079                       'title' : info[u'title'],
4080                       'ext' :   'mp3',
4081                       'url' :   final_url,
4082                       'thumbnail' : info[u'thumb_url'],
4083                       'uploader' :  info[u'artist']
4084                       }
4085
4086         return [track_info]
4087
4088 class RedTubeIE(InfoExtractor):
4089     """Information Extractor for redtube"""
4090     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4091
4092     def _real_extract(self,url):
4093         mobj = re.match(self._VALID_URL, url)
4094         if mobj is None:
4095             raise ExtractorError(u'Invalid URL: %s' % url)
4096
4097         video_id = mobj.group('id')
4098         video_extension = 'mp4'
4099         webpage = self._download_webpage(url, video_id)
4100
4101         self.report_extraction(video_id)
4102
4103         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
4104             webpage, u'video URL')
4105
4106         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
4107             webpage, u'title')
4108
4109         return [{
4110             'id':       video_id,
4111             'url':      video_url,
4112             'ext':      video_extension,
4113             'title':    video_title,
4114         }]
4115
4116 class InaIE(InfoExtractor):
4117     """Information Extractor for Ina.fr"""
4118     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4119
4120     def _real_extract(self,url):
4121         mobj = re.match(self._VALID_URL, url)
4122
4123         video_id = mobj.group('id')
4124         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4125         video_extension = 'mp4'
4126         webpage = self._download_webpage(mrss_url, video_id)
4127
4128         self.report_extraction(video_id)
4129
4130         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
4131             webpage, u'video URL')
4132
4133         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
4134             webpage, u'title')
4135
4136         return [{
4137             'id':       video_id,
4138             'url':      video_url,
4139             'ext':      video_extension,
4140             'title':    video_title,
4141         }]
4142
4143 class HowcastIE(InfoExtractor):
4144     """Information Extractor for Howcast.com"""
4145     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4146
4147     def _real_extract(self, url):
4148         mobj = re.match(self._VALID_URL, url)
4149
4150         video_id = mobj.group('id')
4151         webpage_url = 'http://www.howcast.com/videos/' + video_id
4152         webpage = self._download_webpage(webpage_url, video_id)
4153
4154         self.report_extraction(video_id)
4155
4156         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
4157             webpage, u'video URL')
4158
4159         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
4160             webpage, u'title')
4161
4162         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
4163             webpage, u'description', fatal=False)
4164
4165         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
4166             webpage, u'thumbnail', fatal=False)
4167
4168         return [{
4169             'id':       video_id,
4170             'url':      video_url,
4171             'ext':      'mp4',
4172             'title':    video_title,
4173             'description': video_description,
4174             'thumbnail': thumbnail,
4175         }]
4176
4177 class VineIE(InfoExtractor):
4178     """Information Extractor for Vine.co"""
4179     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4180
4181     def _real_extract(self, url):
4182         mobj = re.match(self._VALID_URL, url)
4183
4184         video_id = mobj.group('id')
4185         webpage_url = 'https://vine.co/v/' + video_id
4186         webpage = self._download_webpage(webpage_url, video_id)
4187
4188         self.report_extraction(video_id)
4189
4190         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
4191             webpage, u'video URL')
4192
4193         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
4194             webpage, u'title')
4195
4196         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
4197             webpage, u'thumbnail', fatal=False)
4198
4199         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
4200             webpage, u'uploader', fatal=False, flags=re.DOTALL)
4201
4202         return [{
4203             'id':        video_id,
4204             'url':       video_url,
4205             'ext':       'mp4',
4206             'title':     video_title,
4207             'thumbnail': thumbnail,
4208             'uploader':  uploader,
4209         }]
4210
4211 class FlickrIE(InfoExtractor):
4212     """Information Extractor for Flickr videos"""
4213     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4214
4215     def _real_extract(self, url):
4216         mobj = re.match(self._VALID_URL, url)
4217
4218         video_id = mobj.group('id')
4219         video_uploader_id = mobj.group('uploader_id')
4220         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4221         webpage = self._download_webpage(webpage_url, video_id)
4222
4223         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
4224
4225         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4226         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
4227
4228         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
4229             first_xml, u'node_id')
4230
4231         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4232         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4233
4234         self.report_extraction(video_id)
4235
4236         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
4237         if mobj is None:
4238             raise ExtractorError(u'Unable to extract video url')
4239         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
4240
4241         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
4242             webpage, u'video title')
4243
4244         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
4245             webpage, u'description', fatal=False)
4246
4247         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
4248             webpage, u'thumbnail', fatal=False)
4249
4250         return [{
4251             'id':          video_id,
4252             'url':         video_url,
4253             'ext':         'mp4',
4254             'title':       video_title,
4255             'description': video_description,
4256             'thumbnail':   thumbnail,
4257             'uploader_id': video_uploader_id,
4258         }]
4259
4260 class TeamcocoIE(InfoExtractor):
4261     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4262
4263     def _real_extract(self, url):
4264         mobj = re.match(self._VALID_URL, url)
4265         if mobj is None:
4266             raise ExtractorError(u'Invalid URL: %s' % url)
4267         url_title = mobj.group('url_title')
4268         webpage = self._download_webpage(url, url_title)
4269
4270         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
4271             webpage, u'video id')
4272
4273         self.report_extraction(video_id)
4274
4275         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
4276             webpage, u'title')
4277
4278         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
4279             webpage, u'thumbnail', fatal=False)
4280
4281         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
4282             webpage, u'description', fatal=False)
4283
4284         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4285         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
4286
4287         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
4288             data, u'video URL')
4289
4290         return [{
4291             'id':          video_id,
4292             'url':         video_url,
4293             'ext':         'mp4',
4294             'title':       video_title,
4295             'thumbnail':   thumbnail,
4296             'description': video_description,
4297         }]
4298
4299 class XHamsterIE(InfoExtractor):
4300     """Information Extractor for xHamster"""
4301     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4302
4303     def _real_extract(self,url):
4304         mobj = re.match(self._VALID_URL, url)
4305
4306         video_id = mobj.group('id')
4307         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
4308         webpage = self._download_webpage(mrss_url, video_id)
4309
4310         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4311         if mobj is None:
4312             raise ExtractorError(u'Unable to extract media URL')
4313         if len(mobj.group('server')) == 0:
4314             video_url = compat_urllib_parse.unquote(mobj.group('file'))
4315         else:
4316             video_url = mobj.group('server')+'/key='+mobj.group('file')
4317         video_extension = video_url.split('.')[-1]
4318
4319         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
4320             webpage, u'title')
4321
4322         # Can't see the description anywhere in the UI
4323         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
4324         #     webpage, u'description', fatal=False)
4325         # if video_description: video_description = unescapeHTML(video_description)
4326
4327         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4328         if mobj:
4329             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4330         else:
4331             video_upload_date = None
4332             self._downloader.report_warning(u'Unable to extract upload date')
4333
4334         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
4335             webpage, u'uploader id', default=u'anonymous')
4336
4337         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
4338             webpage, u'thumbnail', fatal=False)
4339
4340         return [{
4341             'id':       video_id,
4342             'url':      video_url,
4343             'ext':      video_extension,
4344             'title':    video_title,
4345             # 'description': video_description,
4346             'upload_date': video_upload_date,
4347             'uploader_id': video_uploader_id,
4348             'thumbnail': video_thumbnail
4349         }]
4350
4351 class HypemIE(InfoExtractor):
4352     """Information Extractor for hypem"""
4353     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4354
4355     def _real_extract(self, url):
4356         mobj = re.match(self._VALID_URL, url)
4357         if mobj is None:
4358             raise ExtractorError(u'Invalid URL: %s' % url)
4359         track_id = mobj.group(1)
4360
4361         data = { 'ax': 1, 'ts': time.time() }
4362         data_encoded = compat_urllib_parse.urlencode(data)
4363         complete_url = url + "?" + data_encoded
4364         request = compat_urllib_request.Request(complete_url)
4365         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
4366         cookie = urlh.headers.get('Set-Cookie', '')
4367
4368         self.report_extraction(track_id)
4369
4370         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
4371             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
4372         try:
4373             track_list = json.loads(html_tracks)
4374             track = track_list[u'tracks'][0]
4375         except ValueError:
4376             raise ExtractorError(u'Hypemachine contained invalid JSON.')
4377
4378         key = track[u"key"]
4379         track_id = track[u"id"]
4380         artist = track[u"artist"]
4381         title = track[u"song"]
4382
4383         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4384         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4385         request.add_header('cookie', cookie)
4386         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4387         try:
4388             song_data = json.loads(song_data_json)
4389         except ValueError:
4390             raise ExtractorError(u'Hypemachine contained invalid JSON.')
4391         final_url = song_data[u"url"]
4392
4393         return [{
4394             'id':       track_id,
4395             'url':      final_url,
4396             'ext':      "mp3",
4397             'title':    title,
4398             'artist':   artist,
4399         }]
4400
4401
4402 def gen_extractors():
4403     """ Return a list of an instance of every supported extractor.
4404     The order does matter; the first extractor matched is the one handling the URL.
4405     """
4406     return [
4407         YoutubePlaylistIE(),
4408         YoutubeChannelIE(),
4409         YoutubeUserIE(),
4410         YoutubeSearchIE(),
4411         YoutubeIE(),
4412         MetacafeIE(),
4413         DailymotionIE(),
4414         GoogleSearchIE(),
4415         PhotobucketIE(),
4416         YahooIE(),
4417         YahooSearchIE(),
4418         DepositFilesIE(),
4419         FacebookIE(),
4420         BlipTVIE(),
4421         BlipTVUserIE(),
4422         VimeoIE(),
4423         MyVideoIE(),
4424         ComedyCentralIE(),
4425         EscapistIE(),
4426         CollegeHumorIE(),
4427         XVideosIE(),
4428         SoundcloudSetIE(),
4429         SoundcloudIE(),
4430         InfoQIE(),
4431         MixcloudIE(),
4432         StanfordOpenClassroomIE(),
4433         MTVIE(),
4434         YoukuIE(),
4435         XNXXIE(),
4436         YouJizzIE(),
4437         PornotubeIE(),
4438         YouPornIE(),
4439         GooglePlusIE(),
4440         ArteTvIE(),
4441         NBAIE(),
4442         WorldStarHipHopIE(),
4443         JustinTVIE(),
4444         FunnyOrDieIE(),
4445         SteamIE(),
4446         UstreamIE(),
4447         RBMARadioIE(),
4448         EightTracksIE(),
4449         KeekIE(),
4450         TEDIE(),
4451         MySpassIE(),
4452         SpiegelIE(),
4453         LiveLeakIE(),
4454         ARDIE(),
4455         TumblrIE(),
4456         BandcampIE(),
4457         RedTubeIE(),
4458         InaIE(),
4459         HowcastIE(),
4460         VineIE(),
4461         FlickrIE(),
4462         TeamcocoIE(),
4463         XHamsterIE(),
4464         HypemIE(),
4465         GenericIE()
4466     ]
4467
4468 def get_info_extractor(ie_name):
4469     """Returns the info extractor class with the given ie_name"""
4470     return globals()[ie_name+'IE']