youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19 import hashlib
  20 import binascii
  21 import urllib
  22
  23 from .utils import *
  24
  25
  26 class InfoExtractor(object):
  27     """Information Extractor class.
  28
  29     Information extractors are the classes that, given a URL, extract
  30     information about the video (or videos) the URL refers to. This
  31     information includes the real video URL, the video title, author and
  32     others. The information is stored in a dictionary which is then
  33     passed to the FileDownloader. The FileDownloader processes this
  34     information possibly downloading the video to the file system, among
  35     other possible outcomes.
  36
  37     The dictionaries must include the following fields:
  38
  39     id:             Video identifier.
  40     url:            Final video URL.
  41     title:          Video title, unescaped.
  42     ext:            Video filename extension.
  43
  44     The following fields are optional:
  45
  46     format:         The video format, defaults to ext (used for --get-format)
  47     thumbnail:      Full URL to a video thumbnail image.
  48     description:    One-line video description.
  49     uploader:       Full name of the video uploader.
  50     upload_date:    Video upload date (YYYYMMDD).
  51     uploader_id:    Nickname or id of the video uploader.
  52     location:       Physical location of the video.
  53     player_url:     SWF Player URL (used for rtmpdump).
  54     subtitles:      The subtitle file contents.
  55     urlhandle:      [internal] The urlHandle to be used to download the file,
  56                     like returned by urllib.request.urlopen
  57
  58     The fields should all be Unicode strings.
  59
  60     Subclasses of this one should re-define the _real_initialize() and
  61     _real_extract() methods and define a _VALID_URL regexp.
  62     Probably, they should also be added to the list of extractors.
  63
  64     _real_extract() must return a *list* of information dictionaries as
  65     described above.
  66
  67     Finally, the _WORKING attribute should be set to False for broken IEs
  68     in order to warn the users and skip the tests.
  69     """
  70
  71     _ready = False
  72     _downloader = None
  73     _WORKING = True
  74
  75     def __init__(self, downloader=None):
  76         """Constructor. Receives an optional downloader."""
  77         self._ready = False
  78         self.set_downloader(downloader)
  79
  80     @classmethod
  81     def suitable(cls, url):
  82         """Receives a URL and returns True if suitable for this IE."""
  83         return re.match(cls._VALID_URL, url) is not None
  84
  85     @classmethod
  86     def working(cls):
  87         """Getter method for _WORKING."""
  88         return cls._WORKING
  89
  90     def initialize(self):
  91         """Initializes an instance (authentication, etc)."""
  92         if not self._ready:
  93             self._real_initialize()
  94             self._ready = True
  95
  96     def extract(self, url):
  97         """Extracts URL information and returns it in list of dicts."""
  98         self.initialize()
  99         return self._real_extract(url)
 100
 101     def set_downloader(self, downloader):
 102         """Sets the downloader for this IE."""
 103         self._downloader = downloader
 104
 105     def _real_initialize(self):
 106         """Real initialization process. Redefine in subclasses."""
 107         pass
 108
 109     def _real_extract(self, url):
 110         """Real extraction process. Redefine in subclasses."""
 111         pass
 112
 113     @property
 114     def IE_NAME(self):
 115         return type(self).__name__[:-2]
 116
 117     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 118         """ Returns the response handle """
 119         if note is None:
 120             self.report_download_webpage(video_id)
 121         elif note is not False:
 122             self.to_screen(u'%s: %s' % (video_id, note))
 123         try:
 124             return compat_urllib_request.urlopen(url_or_request)
 125         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 126             if errnote is None:
 127                 errnote = u'Unable to download webpage'
 128             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 129
 130     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 131         """ Returns a tuple (page content as string, URL handle) """
 132         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 133         content_type = urlh.headers.get('Content-Type', '')
 134         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 135         if m:
 136             encoding = m.group(1)
 137         else:
 138             encoding = 'utf-8'
 139         webpage_bytes = urlh.read()
 140         if self._downloader.params.get('dump_intermediate_pages', False):
 141             try:
 142                 url = url_or_request.get_full_url()
 143             except AttributeError:
 144                 url = url_or_request
 145             self.to_screen(u'Dumping request to ' + url)
 146             dump = base64.b64encode(webpage_bytes).decode('ascii')
 147             self._downloader.to_screen(dump)
 148         content = webpage_bytes.decode(encoding, 'replace')
 149         return (content, urlh)
 150
 151     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 152         """ Returns the data of the page as a string """
 153         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 154
 155     def to_screen(self, msg):
 156         """Print msg to screen, prefixing it with '[ie_name]'"""
 157         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 158
 159     def report_extraction(self, id_or_name):
 160         """Report information extraction."""
 161         self.to_screen(u'%s: Extracting information' % id_or_name)
 162
 163     def report_download_webpage(self, video_id):
 164         """Report webpage download."""
 165         self.to_screen(u'%s: Downloading webpage' % video_id)
 166
 167     def report_age_confirmation(self):
 168         """Report attempt to confirm age."""
 169         self.to_screen(u'Confirming age')
 170
 171     #Methods for following #608
 172     #They set the correct value of the '_type' key
 173     def video_result(self, video_info):
 174         """Returns a video"""
 175         video_info['_type'] = 'video'
 176         return video_info
 177     def url_result(self, url, ie=None):
 178         """Returns a url that points to a page that should be processed"""
 179         #TODO: ie should be the class used for getting the info
 180         video_info = {'_type': 'url',
 181                       'url': url,
 182                       'ie_key': ie}
 183         return video_info
 184     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 185         """Returns a playlist"""
 186         video_info = {'_type': 'playlist',
 187                       'entries': entries}
 188         if playlist_id:
 189             video_info['id'] = playlist_id
 190         if playlist_title:
 191             video_info['title'] = playlist_title
 192         return video_info
 193
 194     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 195         """
 196         Perform a regex search on the given string, using a single or a list of
 197         patterns returning the first matching group.
 198         In case of failure return a default value or raise a WARNING or a
 199         ExtractorError, depending on fatal, specifying the field name.
 200         """
 201         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 202             mobj = re.search(pattern, string, flags)
 203         else:
 204             for p in pattern:
 205                 mobj = re.search(p, string, flags)
 206                 if mobj: break
 207
 208         if sys.stderr.isatty() and os.name != 'nt':
 209             _name = u'\033[0;34m%s\033[0m' % name
 210         else:
 211             _name = name
 212
 213         if mobj:
 214             # return the first matching group
 215             return next(g for g in mobj.groups() if g is not None)
 216         elif default is not None:
 217             return default
 218         elif fatal:
 219             raise ExtractorError(u'Unable to extract %s' % _name)
 220         else:
 221             self._downloader.report_warning(u'unable to extract %s; '
 222                 u'please report this issue on GitHub.' % _name)
 223             return None
 224
 225     def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 226         """
 227         Like _search_regex, but strips HTML tags and unescapes entities.
 228         """
 229         res = self._search_regex(pattern, string, name, default, fatal, flags)
 230         if res:
 231             return clean_html(res).strip()
 232         else:
 233             return res
 234
 235 class SearchInfoExtractor(InfoExtractor):
 236     """
 237     Base class for paged search queries extractors.
 238     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 239     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 240     """
 241
 242     @classmethod
 243     def _make_valid_url(cls):
 244         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 245
 246     @classmethod
 247     def suitable(cls, url):
 248         return re.match(cls._make_valid_url(), url) is not None
 249
 250     def _real_extract(self, query):
 251         mobj = re.match(self._make_valid_url(), query)
 252         if mobj is None:
 253             raise ExtractorError(u'Invalid search query "%s"' % query)
 254
 255         prefix = mobj.group('prefix')
 256         query = mobj.group('query')
 257         if prefix == '':
 258             return self._get_n_results(query, 1)
 259         elif prefix == 'all':
 260             return self._get_n_results(query, self._MAX_RESULTS)
 261         else:
 262             n = int(prefix)
 263             if n <= 0:
 264                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 265             elif n > self._MAX_RESULTS:
 266                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 267                 n = self._MAX_RESULTS
 268             return self._get_n_results(query, n)
 269
 270     def _get_n_results(self, query, n):
 271         """Get a specified number of results for a query"""
 272         raise NotImplementedError("This method must be implemented by sublclasses")
 273
 274
 275 class YoutubeIE(InfoExtractor):
 276     """Information extractor for youtube.com."""
 277
 278     _VALID_URL = r"""^
 279                      (
 280                          (?:https?://)?                                       # http(s):// (optional)
 281                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 282                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 283                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 284                          (?:                                                  # the various things that can precede the ID:
 285                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 286                              |(?:                                             # or the v= param in all its forms
 287                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 288                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 289                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 290                                  v=
 291                              )
 292                          )?                                                   # optional -> youtube.com/xxxx is OK
 293                      )?                                                       # all until now is optional -> you can pass the naked ID
 294                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 295                      (?(1).+)?                                                # if we found the ID, everything can follow
 296                      $"""
 297     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 298     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 299     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 300     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 301     _NETRC_MACHINE = 'youtube'
 302     # Listed in order of quality
 303     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 304     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 305     _video_extensions = {
 306         '13': '3gp',
 307         '17': 'mp4',
 308         '18': 'mp4',
 309         '22': 'mp4',
 310         '37': 'mp4',
 311         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 312         '43': 'webm',
 313         '44': 'webm',
 314         '45': 'webm',
 315         '46': 'webm',
 316     }
 317     _video_dimensions = {
 318         '5': '240x400',
 319         '6': '???',
 320         '13': '???',
 321         '17': '144x176',
 322         '18': '360x640',
 323         '22': '720x1280',
 324         '34': '360x640',
 325         '35': '480x854',
 326         '37': '1080x1920',
 327         '38': '3072x4096',
 328         '43': '360x640',
 329         '44': '480x854',
 330         '45': '720x1280',
 331         '46': '1080x1920',
 332     }
 333     IE_NAME = u'youtube'
 334
 335     @classmethod
 336     def suitable(cls, url):
 337         """Receives a URL and returns True if suitable for this IE."""
 338         if YoutubePlaylistIE.suitable(url): return False
 339         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 340
 341     def report_lang(self):
 342         """Report attempt to set language."""
 343         self.to_screen(u'Setting language')
 344
 345     def report_login(self):
 346         """Report attempt to log in."""
 347         self.to_screen(u'Logging in')
 348
 349     def report_video_webpage_download(self, video_id):
 350         """Report attempt to download video webpage."""
 351         self.to_screen(u'%s: Downloading video webpage' % video_id)
 352
 353     def report_video_info_webpage_download(self, video_id):
 354         """Report attempt to download video info webpage."""
 355         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 356
 357     def report_video_subtitles_download(self, video_id):
 358         """Report attempt to download video info webpage."""
 359         self.to_screen(u'%s: Checking available subtitles' % video_id)
 360
 361     def report_video_subtitles_request(self, video_id, sub_lang, format):
 362         """Report attempt to download video info webpage."""
 363         self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 364
 365     def report_video_subtitles_available(self, video_id, sub_lang_list):
 366         """Report available subtitles."""
 367         sub_lang = ",".join(list(sub_lang_list.keys()))
 368         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
 369
 370     def report_information_extraction(self, video_id):
 371         """Report attempt to extract video information."""
 372         self.to_screen(u'%s: Extracting video information' % video_id)
 373
 374     def report_unavailable_format(self, video_id, format):
 375         """Report extracted video URL."""
 376         self.to_screen(u'%s: Format %s not available' % (video_id, format))
 377
 378     def report_rtmp_download(self):
 379         """Indicate the download will use the RTMP protocol."""
 380         self.to_screen(u'RTMP download detected')
 381
 382     def _get_available_subtitles(self, video_id):
 383         self.report_video_subtitles_download(video_id)
 384         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 385         try:
 386             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 387         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 388             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 389         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 390         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 391         if not sub_lang_list:
 392             return (u'video doesn\'t have subtitles', None)
 393         return sub_lang_list
 394
 395     def _list_available_subtitles(self, video_id):
 396         sub_lang_list = self._get_available_subtitles(video_id)
 397         self.report_video_subtitles_available(video_id, sub_lang_list)
 398
 399     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 400         """
 401         Return tuple:
 402         (error_message, sub_lang, sub)
 403         """
 404         self.report_video_subtitles_request(video_id, sub_lang, format)
 405         params = compat_urllib_parse.urlencode({
 406             'lang': sub_lang,
 407             'name': sub_name,
 408             'v': video_id,
 409             'fmt': format,
 410         })
 411         url = 'http://www.youtube.com/api/timedtext?' + params
 412         try:
 413             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 414         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 415             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 416         if not sub:
 417             return (u'Did not fetch video subtitles', None, None)
 418         return (None, sub_lang, sub)
 419
 420     def _request_automatic_caption(self, video_id, webpage):
 421         """We need the webpage for getting the captions url, pass it as an
 422            argument to speed up the process."""
 423         sub_lang = self._downloader.params.get('subtitleslang') or 'en'
 424         sub_format = self._downloader.params.get('subtitlesformat')
 425         self.to_screen(u'%s: Looking for automatic captions' % video_id)
 426         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 427         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
 428         if mobj is None:
 429             return [(err_msg, None, None)]
 430         player_config = json.loads(mobj.group(1))
 431         try:
 432             args = player_config[u'args']
 433             caption_url = args[u'ttsurl']
 434             timestamp = args[u'timestamp']
 435             params = compat_urllib_parse.urlencode({
 436                 'lang': 'en',
 437                 'tlang': sub_lang,
 438                 'fmt': sub_format,
 439                 'ts': timestamp,
 440                 'kind': 'asr',
 441             })
 442             subtitles_url = caption_url + '&' + params
 443             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
 444             return [(None, sub_lang, sub)]
 445         except KeyError:
 446             return [(err_msg, None, None)]
 447
 448     def _extract_subtitle(self, video_id):
 449         """
 450         Return a list with a tuple:
 451         [(error_message, sub_lang, sub)]
 452         """
 453         sub_lang_list = self._get_available_subtitles(video_id)
 454         sub_format = self._downloader.params.get('subtitlesformat')
 455         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 456             return [(sub_lang_list[0], None, None)]
 457         if self._downloader.params.get('subtitleslang', False):
 458             sub_lang = self._downloader.params.get('subtitleslang')
 459         elif 'en' in sub_lang_list:
 460             sub_lang = 'en'
 461         else:
 462             sub_lang = list(sub_lang_list.keys())[0]
 463         if not sub_lang in sub_lang_list:
 464             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 465
 466         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 467         return [subtitle]
 468
 469     def _extract_all_subtitles(self, video_id):
 470         sub_lang_list = self._get_available_subtitles(video_id)
 471         sub_format = self._downloader.params.get('subtitlesformat')
 472         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 473             return [(sub_lang_list[0], None, None)]
 474         subtitles = []
 475         for sub_lang in sub_lang_list:
 476             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 477             subtitles.append(subtitle)
 478         return subtitles
 479
 480     def _print_formats(self, formats):
 481         print('Available formats:')
 482         for x in formats:
 483             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 484
 485     def _real_initialize(self):
 486         if self._downloader is None:
 487             return
 488
 489         username = None
 490         password = None
 491         downloader_params = self._downloader.params
 492
 493         # Attempt to use provided username and password or .netrc data
 494         if downloader_params.get('username', None) is not None:
 495             username = downloader_params['username']
 496             password = downloader_params['password']
 497         elif downloader_params.get('usenetrc', False):
 498             try:
 499                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 500                 if info is not None:
 501                     username = info[0]
 502                     password = info[2]
 503                 else:
 504                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 505             except (IOError, netrc.NetrcParseError) as err:
 506                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 507                 return
 508
 509         # Set language
 510         request = compat_urllib_request.Request(self._LANG_URL)
 511         try:
 512             self.report_lang()
 513             compat_urllib_request.urlopen(request).read()
 514         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 515             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 516             return
 517
 518         # No authentication to be performed
 519         if username is None:
 520             return
 521
 522         request = compat_urllib_request.Request(self._LOGIN_URL)
 523         try:
 524             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 525         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 526             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 527             return
 528
 529         galx = None
 530         dsh = None
 531         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 532         if match:
 533           galx = match.group(1)
 534
 535         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 536         if match:
 537           dsh = match.group(1)
 538
 539         # Log in
 540         login_form_strs = {
 541                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 542                 u'Email': username,
 543                 u'GALX': galx,
 544                 u'Passwd': password,
 545                 u'PersistentCookie': u'yes',
 546                 u'_utf8': u'霱',
 547                 u'bgresponse': u'js_disabled',
 548                 u'checkConnection': u'',
 549                 u'checkedDomains': u'youtube',
 550                 u'dnConn': u'',
 551                 u'dsh': dsh,
 552                 u'pstMsg': u'0',
 553                 u'rmShown': u'1',
 554                 u'secTok': u'',
 555                 u'signIn': u'Sign in',
 556                 u'timeStmp': u'',
 557                 u'service': u'youtube',
 558                 u'uilel': u'3',
 559                 u'hl': u'en_US',
 560         }
 561         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 562         # chokes on unicode
 563         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 564         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 565         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 566         try:
 567             self.report_login()
 568             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 569             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 570                 self._downloader.report_warning(u'unable to log in: bad username or password')
 571                 return
 572         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 573             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 574             return
 575
 576         # Confirm age
 577         age_form = {
 578                 'next_url':     '/',
 579                 'action_confirm':   'Confirm',
 580                 }
 581         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 582         try:
 583             self.report_age_confirmation()
 584             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 585         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 586             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 587
 588     def _extract_id(self, url):
 589         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 590         if mobj is None:
 591             raise ExtractorError(u'Invalid URL: %s' % url)
 592         video_id = mobj.group(2)
 593         return video_id
 594
 595     def _real_extract(self, url):
 596         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 597         mobj = re.search(self._NEXT_URL_RE, url)
 598         if mobj:
 599             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 600         video_id = self._extract_id(url)
 601
 602         # Get video webpage
 603         self.report_video_webpage_download(video_id)
 604         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 605         request = compat_urllib_request.Request(url)
 606         try:
 607             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 608         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 609             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
 610
 611         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 612
 613         # Attempt to extract SWF player URL
 614         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 615         if mobj is not None:
 616             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 617         else:
 618             player_url = None
 619
 620         # Get video info
 621         self.report_video_info_webpage_download(video_id)
 622         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 623             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 624                     % (video_id, el_type))
 625             video_info_webpage = self._download_webpage(video_info_url, video_id,
 626                                     note=False,
 627                                     errnote='unable to download video info webpage')
 628             video_info = compat_parse_qs(video_info_webpage)
 629             if 'token' in video_info:
 630                 break
 631         if 'token' not in video_info:
 632             if 'reason' in video_info:
 633                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
 634             else:
 635                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
 636
 637         # Check for "rental" videos
 638         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 639             raise ExtractorError(u'"rental" videos not supported')
 640
 641         # Start extracting information
 642         self.report_information_extraction(video_id)
 643
 644         # uploader
 645         if 'author' not in video_info:
 646             raise ExtractorError(u'Unable to extract uploader name')
 647         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 648
 649         # uploader_id
 650         video_uploader_id = None
 651         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 652         if mobj is not None:
 653             video_uploader_id = mobj.group(1)
 654         else:
 655             self._downloader.report_warning(u'unable to extract uploader nickname')
 656
 657         # title
 658         if 'title' not in video_info:
 659             raise ExtractorError(u'Unable to extract video title')
 660         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 661
 662         # thumbnail image
 663         if 'thumbnail_url' not in video_info:
 664             self._downloader.report_warning(u'unable to extract video thumbnail')
 665             video_thumbnail = ''
 666         else:   # don't panic if we can't find it
 667             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 668
 669         # upload date
 670         upload_date = None
 671         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 672         if mobj is not None:
 673             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 674             upload_date = unified_strdate(upload_date)
 675
 676         # description
 677         video_description = get_element_by_id("eow-description", video_webpage)
 678         if video_description:
 679             video_description = clean_html(video_description)
 680         else:
 681             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 682             if fd_mobj:
 683                 video_description = unescapeHTML(fd_mobj.group(1))
 684             else:
 685                 video_description = u''
 686
 687         # subtitles
 688         video_subtitles = None
 689
 690         if self._downloader.params.get('writesubtitles', False):
 691             video_subtitles = self._extract_subtitle(video_id)
 692             if video_subtitles:
 693                 (sub_error, sub_lang, sub) = video_subtitles[0]
 694                 if sub_error:
 695                     # We try with the automatic captions
 696                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
 697                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
 698                     if sub is not None:
 699                         pass
 700                     else:
 701                         # We report the original error
 702                         self._downloader.report_error(sub_error)
 703
 704         if self._downloader.params.get('allsubtitles', False):
 705             video_subtitles = self._extract_all_subtitles(video_id)
 706             for video_subtitle in video_subtitles:
 707                 (sub_error, sub_lang, sub) = video_subtitle
 708                 if sub_error:
 709                     self._downloader.report_error(sub_error)
 710
 711         if self._downloader.params.get('listsubtitles', False):
 712             sub_lang_list = self._list_available_subtitles(video_id)
 713             return
 714
 715         if 'length_seconds' not in video_info:
 716             self._downloader.report_warning(u'unable to extract video duration')
 717             video_duration = ''
 718         else:
 719             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 720
 721         # token
 722         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 723
 724         # Decide which formats to download
 725         req_format = self._downloader.params.get('format', None)
 726
 727         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 728             self.report_rtmp_download()
 729             video_url_list = [(None, video_info['conn'][0])]
 730         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 731             url_map = {}
 732             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
 733                 url_data = compat_parse_qs(url_data_str)
 734                 if 'itag' in url_data and 'url' in url_data:
 735                     url = url_data['url'][0]
 736                     if 'sig' in url_data:
 737                         url += '&signature=' + url_data['sig'][0]
 738                     if 'ratebypass' not in url:
 739                         url += '&ratebypass=yes'
 740                     url_map[url_data['itag'][0]] = url
 741
 742             format_limit = self._downloader.params.get('format_limit', None)
 743             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 744             if format_limit is not None and format_limit in available_formats:
 745                 format_list = available_formats[available_formats.index(format_limit):]
 746             else:
 747                 format_list = available_formats
 748             existing_formats = [x for x in format_list if x in url_map]
 749             if len(existing_formats) == 0:
 750                 raise ExtractorError(u'no known formats available for video')
 751             if self._downloader.params.get('listformats', None):
 752                 self._print_formats(existing_formats)
 753                 return
 754             if req_format is None or req_format == 'best':
 755                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 756             elif req_format == 'worst':
 757                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 758             elif req_format in ('-1', 'all'):
 759                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 760             else:
 761                 # Specific formats. We pick the first in a slash-delimeted sequence.
 762                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 763                 req_formats = req_format.split('/')
 764                 video_url_list = None
 765                 for rf in req_formats:
 766                     if rf in url_map:
 767                         video_url_list = [(rf, url_map[rf])]
 768                         break
 769                 if video_url_list is None:
 770                     raise ExtractorError(u'requested format not available')
 771         else:
 772             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
 773
 774         results = []
 775         for format_param, video_real_url in video_url_list:
 776             # Extension
 777             video_extension = self._video_extensions.get(format_param, 'flv')
 778
 779             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 780                                               self._video_dimensions.get(format_param, '???'))
 781
 782             results.append({
 783                 'id':       video_id,
 784                 'url':      video_real_url,
 785                 'uploader': video_uploader,
 786                 'uploader_id': video_uploader_id,
 787                 'upload_date':  upload_date,
 788                 'title':    video_title,
 789                 'ext':      video_extension,
 790                 'format':   video_format,
 791                 'thumbnail':    video_thumbnail,
 792                 'description':  video_description,
 793                 'player_url':   player_url,
 794                 'subtitles':    video_subtitles,
 795                 'duration':     video_duration
 796             })
 797         return results
 798
 799
 800 class MetacafeIE(InfoExtractor):
 801     """Information Extractor for metacafe.com."""
 802
 803     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 804     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 805     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 806     IE_NAME = u'metacafe'
 807
 808     def report_disclaimer(self):
 809         """Report disclaimer retrieval."""
 810         self.to_screen(u'Retrieving disclaimer')
 811
 812     def _real_initialize(self):
 813         # Retrieve disclaimer
 814         request = compat_urllib_request.Request(self._DISCLAIMER)
 815         try:
 816             self.report_disclaimer()
 817             disclaimer = compat_urllib_request.urlopen(request).read()
 818         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 819             raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
 820
 821         # Confirm age
 822         disclaimer_form = {
 823             'filters': '0',
 824             'submit': "Continue - I'm over 18",
 825             }
 826         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 827         try:
 828             self.report_age_confirmation()
 829             disclaimer = compat_urllib_request.urlopen(request).read()
 830         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 831             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 832
 833     def _real_extract(self, url):
 834         # Extract id and simplified title from URL
 835         mobj = re.match(self._VALID_URL, url)
 836         if mobj is None:
 837             raise ExtractorError(u'Invalid URL: %s' % url)
 838
 839         video_id = mobj.group(1)
 840
 841         # Check if video comes from YouTube
 842         mobj2 = re.match(r'^yt-(.*)$', video_id)
 843         if mobj2 is not None:
 844             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
 845
 846         # Retrieve video webpage to extract further information
 847         webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
 848
 849         # Extract URL, uploader and title from webpage
 850         self.report_extraction(video_id)
 851         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 852         if mobj is not None:
 853             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 854             video_extension = mediaURL[-3:]
 855
 856             # Extract gdaKey if available
 857             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 858             if mobj is None:
 859                 video_url = mediaURL
 860             else:
 861                 gdaKey = mobj.group(1)
 862                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 863         else:
 864             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 865             if mobj is None:
 866                 raise ExtractorError(u'Unable to extract media URL')
 867             vardict = compat_parse_qs(mobj.group(1))
 868             if 'mediaData' not in vardict:
 869                 raise ExtractorError(u'Unable to extract media URL')
 870             mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
 871             if mobj is None:
 872                 raise ExtractorError(u'Unable to extract media URL')
 873             mediaURL = mobj.group('mediaURL').replace('\\/', '/')
 874             video_extension = mediaURL[-3:]
 875             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
 876
 877         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 878         if mobj is None:
 879             raise ExtractorError(u'Unable to extract title')
 880         video_title = mobj.group(1).decode('utf-8')
 881
 882         mobj = re.search(r'submitter=(.*?);', webpage)
 883         if mobj is None:
 884             raise ExtractorError(u'Unable to extract uploader nickname')
 885         video_uploader = mobj.group(1)
 886
 887         return [{
 888             'id':       video_id.decode('utf-8'),
 889             'url':      video_url.decode('utf-8'),
 890             'uploader': video_uploader.decode('utf-8'),
 891             'upload_date':  None,
 892             'title':    video_title,
 893             'ext':      video_extension.decode('utf-8'),
 894         }]
 895
 896 class DailymotionIE(InfoExtractor):
 897     """Information Extractor for Dailymotion"""
 898
 899     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 900     IE_NAME = u'dailymotion'
 901
 902     def _real_extract(self, url):
 903         # Extract id and simplified title from URL
 904         mobj = re.match(self._VALID_URL, url)
 905         if mobj is None:
 906             raise ExtractorError(u'Invalid URL: %s' % url)
 907
 908         video_id = mobj.group(1).split('_')[0].split('?')[0]
 909
 910         video_extension = 'mp4'
 911
 912         # Retrieve video webpage to extract further information
 913         request = compat_urllib_request.Request(url)
 914         request.add_header('Cookie', 'family_filter=off')
 915         webpage = self._download_webpage(request, video_id)
 916
 917         # Extract URL, uploader and title from webpage
 918         self.report_extraction(video_id)
 919         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 920         if mobj is None:
 921             raise ExtractorError(u'Unable to extract media URL')
 922         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 923
 924         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 925             if key in flashvars:
 926                 max_quality = key
 927                 self.to_screen(u'Using %s' % key)
 928                 break
 929         else:
 930             raise ExtractorError(u'Unable to extract video URL')
 931
 932         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 933         if mobj is None:
 934             raise ExtractorError(u'Unable to extract video URL')
 935
 936         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 937
 938         # TODO: support choosing qualities
 939
 940         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 941         if mobj is None:
 942             raise ExtractorError(u'Unable to extract title')
 943         video_title = unescapeHTML(mobj.group('title'))
 944
 945         video_uploader = None
 946         video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
 947                                              # Looking for official user
 948                                              r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
 949                                             webpage, 'video uploader')
 950
 951         video_upload_date = None
 952         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 953         if mobj is not None:
 954             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 955
 956         return [{
 957             'id':       video_id,
 958             'url':      video_url,
 959             'uploader': video_uploader,
 960             'upload_date':  video_upload_date,
 961             'title':    video_title,
 962             'ext':      video_extension,
 963         }]
 964
 965
 966 class PhotobucketIE(InfoExtractor):
 967     """Information extractor for photobucket.com."""
 968
 969     # TODO: the original _VALID_URL was:
 970     # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 971     # Check if it's necessary to keep the old extracion process
 972     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
 973     IE_NAME = u'photobucket'
 974
 975     def _real_extract(self, url):
 976         # Extract id from URL
 977         mobj = re.match(self._VALID_URL, url)
 978         if mobj is None:
 979             raise ExtractorError(u'Invalid URL: %s' % url)
 980
 981         video_id = mobj.group('id')
 982
 983         video_extension = mobj.group('ext')
 984
 985         # Retrieve video webpage to extract further information
 986         webpage = self._download_webpage(url, video_id)
 987
 988         # Extract URL, uploader, and title from webpage
 989         self.report_extraction(video_id)
 990         # We try first by looking the javascript code:
 991         mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
 992         if mobj is not None:
 993             info = json.loads(mobj.group('json'))
 994             return [{
 995                 'id':       video_id,
 996                 'url':      info[u'downloadUrl'],
 997                 'uploader': info[u'username'],
 998                 'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
 999                 'title':    info[u'title'],
1000                 'ext':      video_extension,
1001                 'thumbnail': info[u'thumbUrl'],
1002             }]
1003
1004         # We try looking in other parts of the webpage
1005         video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1006             webpage, u'video URL')
1007
1008         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1009         if mobj is None:
1010             raise ExtractorError(u'Unable to extract title')
1011         video_title = mobj.group(1).decode('utf-8')
1012         video_uploader = mobj.group(2).decode('utf-8')
1013
1014         return [{
1015             'id':       video_id.decode('utf-8'),
1016             'url':      video_url.decode('utf-8'),
1017             'uploader': video_uploader,
1018             'upload_date':  None,
1019             'title':    video_title,
1020             'ext':      video_extension.decode('utf-8'),
1021         }]
1022
1023
1024 class YahooIE(InfoExtractor):
1025     """Information extractor for screen.yahoo.com."""
1026     _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1027
1028     def _real_extract(self, url):
1029         mobj = re.match(self._VALID_URL, url)
1030         if mobj is None:
1031             raise ExtractorError(u'Invalid URL: %s' % url)
1032         video_id = mobj.group('id')
1033         webpage = self._download_webpage(url, video_id)
1034         m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1035
1036         if m_id is None:
1037             # TODO: Check which url parameters are required
1038             info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1039             webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
1040             info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1041                         <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1042                         <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1043                         <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1044                         '''
1045             self.report_extraction(video_id)
1046             m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1047             if m_info is None:
1048                 raise ExtractorError(u'Unable to extract video info')
1049             video_title = m_info.group('title')
1050             video_description = m_info.group('description')
1051             video_thumb = m_info.group('thumb')
1052             video_date = m_info.group('date')
1053             video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1054
1055             # TODO: Find a way to get mp4 videos
1056             rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1057             webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1058             m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1059             video_url = m_rest.group('url')
1060             video_path = m_rest.group('path')
1061             if m_rest is None:
1062                 raise ExtractorError(u'Unable to extract video url')
1063
1064         else: # We have to use a different method if another id is defined
1065             long_id = m_id.group('new_id')
1066             info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1067             webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
1068             json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1069             info = json.loads(json_str)
1070             res = info[u'query'][u'results'][u'mediaObj'][0]
1071             stream = res[u'streams'][0]
1072             video_path = stream[u'path']
1073             video_url = stream[u'host']
1074             meta = res[u'meta']
1075             video_title = meta[u'title']
1076             video_description = meta[u'description']
1077             video_thumb = meta[u'thumbnail']
1078             video_date = None # I can't find it
1079
1080         info_dict = {
1081                      'id': video_id,
1082                      'url': video_url,
1083                      'play_path': video_path,
1084                      'title':video_title,
1085                      'description': video_description,
1086                      'thumbnail': video_thumb,
1087                      'upload_date': video_date,
1088                      'ext': 'flv',
1089                      }
1090         return info_dict
1091
1092 class VimeoIE(InfoExtractor):
1093     """Information extractor for vimeo.com."""
1094
1095     # _VALID_URL matches Vimeo URLs
1096     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1097     IE_NAME = u'vimeo'
1098
1099     def _real_extract(self, url, new_video=True):
1100         # Extract ID from URL
1101         mobj = re.match(self._VALID_URL, url)
1102         if mobj is None:
1103             raise ExtractorError(u'Invalid URL: %s' % url)
1104
1105         video_id = mobj.group('id')
1106         if not mobj.group('proto'):
1107             url = 'https://' + url
1108         if mobj.group('direct_link') or mobj.group('pro'):
1109             url = 'https://vimeo.com/' + video_id
1110
1111         # Retrieve video webpage to extract further information
1112         request = compat_urllib_request.Request(url, None, std_headers)
1113         webpage = self._download_webpage(request, video_id)
1114
1115         # Now we begin extracting as much information as we can from what we
1116         # retrieved. First we extract the information common to all extractors,
1117         # and latter we extract those that are Vimeo specific.
1118         self.report_extraction(video_id)
1119
1120         # Extract the config JSON
1121         try:
1122             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1123             config = json.loads(config)
1124         except:
1125             if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1126                 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1127             else:
1128                 raise ExtractorError(u'Unable to extract info section')
1129
1130         # Extract title
1131         video_title = config["video"]["title"]
1132
1133         # Extract uploader and uploader_id
1134         video_uploader = config["video"]["owner"]["name"]
1135         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1136
1137         # Extract video thumbnail
1138         video_thumbnail = config["video"]["thumbnail"]
1139
1140         # Extract video description
1141         video_description = get_element_by_attribute("itemprop", "description", webpage)
1142         if video_description: video_description = clean_html(video_description)
1143         else: video_description = u''
1144
1145         # Extract upload date
1146         video_upload_date = None
1147         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1148         if mobj is not None:
1149             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1150
1151         # Vimeo specific: extract request signature and timestamp
1152         sig = config['request']['signature']
1153         timestamp = config['request']['timestamp']
1154
1155         # Vimeo specific: extract video codec and quality information
1156         # First consider quality, then codecs, then take everything
1157         # TODO bind to format param
1158         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1159         files = { 'hd': [], 'sd': [], 'other': []}
1160         for codec_name, codec_extension in codecs:
1161             if codec_name in config["video"]["files"]:
1162                 if 'hd' in config["video"]["files"][codec_name]:
1163                     files['hd'].append((codec_name, codec_extension, 'hd'))
1164                 elif 'sd' in config["video"]["files"][codec_name]:
1165                     files['sd'].append((codec_name, codec_extension, 'sd'))
1166                 else:
1167                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1168
1169         for quality in ('hd', 'sd', 'other'):
1170             if len(files[quality]) > 0:
1171                 video_quality = files[quality][0][2]
1172                 video_codec = files[quality][0][0]
1173                 video_extension = files[quality][0][1]
1174                 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1175                 break
1176         else:
1177             raise ExtractorError(u'No known codec found')
1178
1179         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1180                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1181
1182         return [{
1183             'id':       video_id,
1184             'url':      video_url,
1185             'uploader': video_uploader,
1186             'uploader_id': video_uploader_id,
1187             'upload_date':  video_upload_date,
1188             'title':    video_title,
1189             'ext':      video_extension,
1190             'thumbnail':    video_thumbnail,
1191             'description':  video_description,
1192         }]
1193
1194
1195 class ArteTvIE(InfoExtractor):
1196     """arte.tv information extractor."""
1197
1198     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1199     _LIVE_URL = r'index-[0-9]+\.html$'
1200
1201     IE_NAME = u'arte.tv'
1202
1203     def fetch_webpage(self, url):
1204         request = compat_urllib_request.Request(url)
1205         try:
1206             self.report_download_webpage(url)
1207             webpage = compat_urllib_request.urlopen(request).read()
1208         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1209             raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1210         except ValueError as err:
1211             raise ExtractorError(u'Invalid URL: %s' % url)
1212         return webpage
1213
1214     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1215         page = self.fetch_webpage(url)
1216         mobj = re.search(regex, page, regexFlags)
1217         info = {}
1218
1219         if mobj is None:
1220             raise ExtractorError(u'Invalid URL: %s' % url)
1221
1222         for (i, key, err) in matchTuples:
1223             if mobj.group(i) is None:
1224                 raise ExtractorError(err)
1225             else:
1226                 info[key] = mobj.group(i)
1227
1228         return info
1229
1230     def extractLiveStream(self, url):
1231         video_lang = url.split('/')[-4]
1232         info = self.grep_webpage(
1233             url,
1234             r'src="(.*?/videothek_js.*?\.js)',
1235             0,
1236             [
1237                 (1, 'url', u'Invalid URL: %s' % url)
1238             ]
1239         )
1240         http_host = url.split('/')[2]
1241         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1242         info = self.grep_webpage(
1243             next_url,
1244             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1245                 '(http://.*?\.swf).*?' +
1246                 '(rtmp://.*?)\'',
1247             re.DOTALL,
1248             [
1249                 (1, 'path',   u'could not extract video path: %s' % url),
1250                 (2, 'player', u'could not extract video player: %s' % url),
1251                 (3, 'url',    u'could not extract video url: %s' % url)
1252             ]
1253         )
1254         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1255
1256     def extractPlus7Stream(self, url):
1257         video_lang = url.split('/')[-3]
1258         info = self.grep_webpage(
1259             url,
1260             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1261             0,
1262             [
1263                 (1, 'url', u'Invalid URL: %s' % url)
1264             ]
1265         )
1266         next_url = compat_urllib_parse.unquote(info.get('url'))
1267         info = self.grep_webpage(
1268             next_url,
1269             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1270             0,
1271             [
1272                 (1, 'url', u'Could not find <video> tag: %s' % url)
1273             ]
1274         )
1275         next_url = compat_urllib_parse.unquote(info.get('url'))
1276
1277         info = self.grep_webpage(
1278             next_url,
1279             r'<video id="(.*?)".*?>.*?' +
1280                 '<name>(.*?)</name>.*?' +
1281                 '<dateVideo>(.*?)</dateVideo>.*?' +
1282                 '<url quality="hd">(.*?)</url>',
1283             re.DOTALL,
1284             [
1285                 (1, 'id',    u'could not extract video id: %s' % url),
1286                 (2, 'title', u'could not extract video title: %s' % url),
1287                 (3, 'date',  u'could not extract video date: %s' % url),
1288                 (4, 'url',   u'could not extract video url: %s' % url)
1289             ]
1290         )
1291
1292         return {
1293             'id':           info.get('id'),
1294             'url':          compat_urllib_parse.unquote(info.get('url')),
1295             'uploader':     u'arte.tv',
1296             'upload_date':  unified_strdate(info.get('date')),
1297             'title':        info.get('title').decode('utf-8'),
1298             'ext':          u'mp4',
1299             'format':       u'NA',
1300             'player_url':   None,
1301         }
1302
1303     def _real_extract(self, url):
1304         video_id = url.split('/')[-1]
1305         self.report_extraction(video_id)
1306
1307         if re.search(self._LIVE_URL, video_id) is not None:
1308             self.extractLiveStream(url)
1309             return
1310         else:
1311             info = self.extractPlus7Stream(url)
1312
1313         return [info]
1314
1315
1316 class GenericIE(InfoExtractor):
1317     """Generic last-resort information extractor."""
1318
1319     _VALID_URL = r'.*'
1320     IE_NAME = u'generic'
1321
1322     def report_download_webpage(self, video_id):
1323         """Report webpage download."""
1324         if not self._downloader.params.get('test', False):
1325             self._downloader.report_warning(u'Falling back on generic information extractor.')
1326         super(GenericIE, self).report_download_webpage(video_id)
1327
1328     def report_following_redirect(self, new_url):
1329         """Report information extraction."""
1330         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1331
1332     def _test_redirect(self, url):
1333         """Check if it is a redirect, like url shorteners, in case return the new url."""
1334         class HeadRequest(compat_urllib_request.Request):
1335             def get_method(self):
1336                 return "HEAD"
1337
1338         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1339             """
1340             Subclass the HTTPRedirectHandler to make it use our
1341             HeadRequest also on the redirected URL
1342             """
1343             def redirect_request(self, req, fp, code, msg, headers, newurl):
1344                 if code in (301, 302, 303, 307):
1345                     newurl = newurl.replace(' ', '%20')
1346                     newheaders = dict((k,v) for k,v in req.headers.items()
1347                                       if k.lower() not in ("content-length", "content-type"))
1348                     return HeadRequest(newurl,
1349                                        headers=newheaders,
1350                                        origin_req_host=req.get_origin_req_host(),
1351                                        unverifiable=True)
1352                 else:
1353                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1354
1355         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1356             """
1357             Fallback to GET if HEAD is not allowed (405 HTTP error)
1358             """
1359             def http_error_405(self, req, fp, code, msg, headers):
1360                 fp.read()
1361                 fp.close()
1362
1363                 newheaders = dict((k,v) for k,v in req.headers.items()
1364                                   if k.lower() not in ("content-length", "content-type"))
1365                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1366                                                  headers=newheaders,
1367                                                  origin_req_host=req.get_origin_req_host(),
1368                                                  unverifiable=True))
1369
1370         # Build our opener
1371         opener = compat_urllib_request.OpenerDirector()
1372         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1373                         HTTPMethodFallback, HEADRedirectHandler,
1374                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1375             opener.add_handler(handler())
1376
1377         response = opener.open(HeadRequest(url))
1378         if response is None:
1379             raise ExtractorError(u'Invalid URL protocol')
1380         new_url = response.geturl()
1381
1382         if url == new_url:
1383             return False
1384
1385         self.report_following_redirect(new_url)
1386         return new_url
1387
1388     def _real_extract(self, url):
1389         new_url = self._test_redirect(url)
1390         if new_url: return [self.url_result(new_url)]
1391
1392         video_id = url.split('/')[-1]
1393         try:
1394             webpage = self._download_webpage(url, video_id)
1395         except ValueError as err:
1396             # since this is the last-resort InfoExtractor, if
1397             # this error is thrown, it'll be thrown here
1398             raise ExtractorError(u'Invalid URL: %s' % url)
1399
1400         self.report_extraction(video_id)
1401         # Start with something easy: JW Player in SWFObject
1402         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1403         if mobj is None:
1404             # Broaden the search a little bit
1405             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1406         if mobj is None:
1407             # Broaden the search a little bit: JWPlayer JS loader
1408             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1409         if mobj is None:
1410             # Try to find twitter cards info
1411             mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1412         if mobj is None:
1413             raise ExtractorError(u'Invalid URL: %s' % url)
1414
1415         # It's possible that one of the regexes
1416         # matched, but returned an empty group:
1417         if mobj.group(1) is None:
1418             raise ExtractorError(u'Invalid URL: %s' % url)
1419
1420         video_url = compat_urllib_parse.unquote(mobj.group(1))
1421         video_id = os.path.basename(video_url)
1422
1423         # here's a fun little line of code for you:
1424         video_extension = os.path.splitext(video_id)[1][1:]
1425         video_id = os.path.splitext(video_id)[0]
1426
1427         # it's tempting to parse this further, but you would
1428         # have to take into account all the variations like
1429         #   Video Title - Site Name
1430         #   Site Name | Video Title
1431         #   Video Title - Tagline | Site Name
1432         # and so on and so forth; it's just not practical
1433         video_title = self._html_search_regex(r'<title>(.*)</title>',
1434             webpage, u'video title')
1435
1436         # video uploader is domain name
1437         video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1438             url, u'video uploader')
1439
1440         return [{
1441             'id':       video_id,
1442             'url':      video_url,
1443             'uploader': video_uploader,
1444             'upload_date':  None,
1445             'title':    video_title,
1446             'ext':      video_extension,
1447         }]
1448
1449
1450 class YoutubeSearchIE(SearchInfoExtractor):
1451     """Information Extractor for YouTube search queries."""
1452     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1453     _MAX_RESULTS = 1000
1454     IE_NAME = u'youtube:search'
1455     _SEARCH_KEY = 'ytsearch'
1456
1457     def report_download_page(self, query, pagenum):
1458         """Report attempt to download search page with given number."""
1459         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1460
1461     def _get_n_results(self, query, n):
1462         """Get a specified number of results for a query"""
1463
1464         video_ids = []
1465         pagenum = 0
1466         limit = n
1467
1468         while (50 * pagenum) < limit:
1469             self.report_download_page(query, pagenum+1)
1470             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1471             request = compat_urllib_request.Request(result_url)
1472             try:
1473                 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1474             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1475                 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1476             api_response = json.loads(data)['data']
1477
1478             if not 'items' in api_response:
1479                 raise ExtractorError(u'[youtube] No video results')
1480
1481             new_ids = list(video['id'] for video in api_response['items'])
1482             video_ids += new_ids
1483
1484             limit = min(n, api_response['totalItems'])
1485             pagenum += 1
1486
1487         if len(video_ids) > n:
1488             video_ids = video_ids[:n]
1489         videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1490         return self.playlist_result(videos, query)
1491
1492
1493 class GoogleSearchIE(SearchInfoExtractor):
1494     """Information Extractor for Google Video search queries."""
1495     _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1496     _MAX_RESULTS = 1000
1497     IE_NAME = u'video.google:search'
1498     _SEARCH_KEY = 'gvsearch'
1499
1500     def _get_n_results(self, query, n):
1501         """Get a specified number of results for a query"""
1502
1503         res = {
1504             '_type': 'playlist',
1505             'id': query,
1506             'entries': []
1507         }
1508
1509         for pagenum in itertools.count(1):
1510             result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1511             webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1512                                              note='Downloading result page ' + str(pagenum))
1513
1514             for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1515                 e = {
1516                     '_type': 'url',
1517                     'url': mobj.group(1)
1518                 }
1519                 res['entries'].append(e)
1520
1521             if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1522                 return res
1523
1524 class YahooSearchIE(SearchInfoExtractor):
1525     """Information Extractor for Yahoo! Video search queries."""
1526
1527     _MAX_RESULTS = 1000
1528     IE_NAME = u'screen.yahoo:search'
1529     _SEARCH_KEY = 'yvsearch'
1530
1531     def _get_n_results(self, query, n):
1532         """Get a specified number of results for a query"""
1533
1534         res = {
1535             '_type': 'playlist',
1536             'id': query,
1537             'entries': []
1538         }
1539         for pagenum in itertools.count(0):
1540             result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1541             webpage = self._download_webpage(result_url, query,
1542                                              note='Downloading results page '+str(pagenum+1))
1543             info = json.loads(webpage)
1544             m = info[u'm']
1545             results = info[u'results']
1546
1547             for (i, r) in enumerate(results):
1548                 if (pagenum * 30) +i >= n:
1549                     break
1550                 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1551                 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1552                 res['entries'].append(e)
1553             if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1554                 break
1555
1556         return res
1557
1558
1559 class YoutubePlaylistIE(InfoExtractor):
1560     """Information Extractor for YouTube playlists."""
1561
1562     _VALID_URL = r"""(?:
1563                         (?:https?://)?
1564                         (?:\w+\.)?
1565                         youtube\.com/
1566                         (?:
1567                            (?:course|view_play_list|my_playlists|artist|playlist|watch)
1568                            \? (?:.*?&)*? (?:p|a|list)=
1569                         |  p/
1570                         )
1571                         ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1572                         .*
1573                      |
1574                         ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1575                      )"""
1576     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1577     _MAX_RESULTS = 50
1578     IE_NAME = u'youtube:playlist'
1579
1580     @classmethod
1581     def suitable(cls, url):
1582         """Receives a URL and returns True if suitable for this IE."""
1583         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1584
1585     def _real_extract(self, url):
1586         # Extract playlist id
1587         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1588         if mobj is None:
1589             raise ExtractorError(u'Invalid URL: %s' % url)
1590
1591         # Download playlist videos from API
1592         playlist_id = mobj.group(1) or mobj.group(2)
1593         page_num = 1
1594         videos = []
1595
1596         while True:
1597             url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1598             page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1599
1600             try:
1601                 response = json.loads(page)
1602             except ValueError as err:
1603                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1604
1605             if 'feed' not in response:
1606                 raise ExtractorError(u'Got a malformed response from YouTube API')
1607             playlist_title = response['feed']['title']['$t']
1608             if 'entry' not in response['feed']:
1609                 # Number of videos is a multiple of self._MAX_RESULTS
1610                 break
1611
1612             for entry in response['feed']['entry']:
1613                 index = entry['yt$position']['$t']
1614                 if 'media$group' in entry and 'media$player' in entry['media$group']:
1615                     videos.append((index, entry['media$group']['media$player']['url']))
1616
1617             if len(response['feed']['entry']) < self._MAX_RESULTS:
1618                 break
1619             page_num += 1
1620
1621         videos = [v[1] for v in sorted(videos)]
1622
1623         url_results = [self.url_result(url, 'Youtube') for url in videos]
1624         return [self.playlist_result(url_results, playlist_id, playlist_title)]
1625
1626
1627 class YoutubeChannelIE(InfoExtractor):
1628     """Information Extractor for YouTube channels."""
1629
1630     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1631     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1632     _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1633     _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1634     IE_NAME = u'youtube:channel'
1635
1636     def extract_videos_from_page(self, page):
1637         ids_in_page = []
1638         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1639             if mobj.group(1) not in ids_in_page:
1640                 ids_in_page.append(mobj.group(1))
1641         return ids_in_page
1642
1643     def _real_extract(self, url):
1644         # Extract channel id
1645         mobj = re.match(self._VALID_URL, url)
1646         if mobj is None:
1647             raise ExtractorError(u'Invalid URL: %s' % url)
1648
1649         # Download channel page
1650         channel_id = mobj.group(1)
1651         video_ids = []
1652         pagenum = 1
1653
1654         url = self._TEMPLATE_URL % (channel_id, pagenum)
1655         page = self._download_webpage(url, channel_id,
1656                                       u'Downloading page #%s' % pagenum)
1657
1658         # Extract video identifiers
1659         ids_in_page = self.extract_videos_from_page(page)
1660         video_ids.extend(ids_in_page)
1661
1662         # Download any subsequent channel pages using the json-based channel_ajax query
1663         if self._MORE_PAGES_INDICATOR in page:
1664             while True:
1665                 pagenum = pagenum + 1
1666
1667                 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1668                 page = self._download_webpage(url, channel_id,
1669                                               u'Downloading page #%s' % pagenum)
1670
1671                 page = json.loads(page)
1672
1673                 ids_in_page = self.extract_videos_from_page(page['content_html'])
1674                 video_ids.extend(ids_in_page)
1675
1676                 if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']:
1677                     break
1678
1679         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1680
1681         urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1682         url_entries = [self.url_result(url, 'Youtube') for url in urls]
1683         return [self.playlist_result(url_entries, channel_id)]
1684
1685
1686 class YoutubeUserIE(InfoExtractor):
1687     """Information Extractor for YouTube users."""
1688
1689     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1690     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1691     _GDATA_PAGE_SIZE = 50
1692     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1693     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1694     IE_NAME = u'youtube:user'
1695
1696     def _real_extract(self, url):
1697         # Extract username
1698         mobj = re.match(self._VALID_URL, url)
1699         if mobj is None:
1700             raise ExtractorError(u'Invalid URL: %s' % url)
1701
1702         username = mobj.group(1)
1703
1704         # Download video ids using YouTube Data API. Result size per
1705         # query is limited (currently to 50 videos) so we need to query
1706         # page by page until there are no video ids - it means we got
1707         # all of them.
1708
1709         video_ids = []
1710         pagenum = 0
1711
1712         while True:
1713             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1714
1715             gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1716             page = self._download_webpage(gdata_url, username,
1717                                           u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1718
1719             # Extract video identifiers
1720             ids_in_page = []
1721
1722             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1723                 if mobj.group(1) not in ids_in_page:
1724                     ids_in_page.append(mobj.group(1))
1725
1726             video_ids.extend(ids_in_page)
1727
1728             # A little optimization - if current page is not
1729             # "full", ie. does not contain PAGE_SIZE video ids then
1730             # we can assume that this page is the last one - there
1731             # are no more ids on further pages - no need to query
1732             # again.
1733
1734             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1735                 break
1736
1737             pagenum += 1
1738
1739         urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1740         url_results = [self.url_result(url, 'Youtube') for url in urls]
1741         return [self.playlist_result(url_results, playlist_title = username)]
1742
1743
1744 class BlipTVUserIE(InfoExtractor):
1745     """Information Extractor for blip.tv users."""
1746
1747     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1748     _PAGE_SIZE = 12
1749     IE_NAME = u'blip.tv:user'
1750
1751     def _real_extract(self, url):
1752         # Extract username
1753         mobj = re.match(self._VALID_URL, url)
1754         if mobj is None:
1755             raise ExtractorError(u'Invalid URL: %s' % url)
1756
1757         username = mobj.group(1)
1758
1759         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1760
1761         page = self._download_webpage(url, username, u'Downloading user page')
1762         mobj = re.search(r'data-users-id="([^"]+)"', page)
1763         page_base = page_base % mobj.group(1)
1764
1765
1766         # Download video ids using BlipTV Ajax calls. Result size per
1767         # query is limited (currently to 12 videos) so we need to query
1768         # page by page until there are no video ids - it means we got
1769         # all of them.
1770
1771         video_ids = []
1772         pagenum = 1
1773
1774         while True:
1775             url = page_base + "&page=" + str(pagenum)
1776             page = self._download_webpage(url, username,
1777                                           u'Downloading video ids from page %d' % pagenum)
1778
1779             # Extract video identifiers
1780             ids_in_page = []
1781
1782             for mobj in re.finditer(r'href="/([^"]+)"', page):
1783                 if mobj.group(1) not in ids_in_page:
1784                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1785
1786             video_ids.extend(ids_in_page)
1787
1788             # A little optimization - if current page is not
1789             # "full", ie. does not contain PAGE_SIZE video ids then
1790             # we can assume that this page is the last one - there
1791             # are no more ids on further pages - no need to query
1792             # again.
1793
1794             if len(ids_in_page) < self._PAGE_SIZE:
1795                 break
1796
1797             pagenum += 1
1798
1799         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1800         url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1801         return [self.playlist_result(url_entries, playlist_title = username)]
1802
1803
1804 class DepositFilesIE(InfoExtractor):
1805     """Information extractor for depositfiles.com"""
1806
1807     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1808
1809     def _real_extract(self, url):
1810         file_id = url.split('/')[-1]
1811         # Rebuild url in english locale
1812         url = 'http://depositfiles.com/en/files/' + file_id
1813
1814         # Retrieve file webpage with 'Free download' button pressed
1815         free_download_indication = { 'gateway_result' : '1' }
1816         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1817         try:
1818             self.report_download_webpage(file_id)
1819             webpage = compat_urllib_request.urlopen(request).read()
1820         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1821             raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1822
1823         # Search for the real file URL
1824         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1825         if (mobj is None) or (mobj.group(1) is None):
1826             # Try to figure out reason of the error.
1827             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1828             if (mobj is not None) and (mobj.group(1) is not None):
1829                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1830                 raise ExtractorError(u'%s' % restriction_message)
1831             else:
1832                 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1833
1834         file_url = mobj.group(1)
1835         file_extension = os.path.splitext(file_url)[1][1:]
1836
1837         # Search for file title
1838         file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
1839
1840         return [{
1841             'id':       file_id.decode('utf-8'),
1842             'url':      file_url.decode('utf-8'),
1843             'uploader': None,
1844             'upload_date':  None,
1845             'title':    file_title,
1846             'ext':      file_extension.decode('utf-8'),
1847         }]
1848
1849
1850 class FacebookIE(InfoExtractor):
1851     """Information Extractor for Facebook"""
1852
1853     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1854     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1855     _NETRC_MACHINE = 'facebook'
1856     IE_NAME = u'facebook'
1857
1858     def report_login(self):
1859         """Report attempt to log in."""
1860         self.to_screen(u'Logging in')
1861
1862     def _real_initialize(self):
1863         if self._downloader is None:
1864             return
1865
1866         useremail = None
1867         password = None
1868         downloader_params = self._downloader.params
1869
1870         # Attempt to use provided username and password or .netrc data
1871         if downloader_params.get('username', None) is not None:
1872             useremail = downloader_params['username']
1873             password = downloader_params['password']
1874         elif downloader_params.get('usenetrc', False):
1875             try:
1876                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1877                 if info is not None:
1878                     useremail = info[0]
1879                     password = info[2]
1880                 else:
1881                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1882             except (IOError, netrc.NetrcParseError) as err:
1883                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1884                 return
1885
1886         if useremail is None:
1887             return
1888
1889         # Log in
1890         login_form = {
1891             'email': useremail,
1892             'pass': password,
1893             'login': 'Log+In'
1894             }
1895         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1896         try:
1897             self.report_login()
1898             login_results = compat_urllib_request.urlopen(request).read()
1899             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1900                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1901                 return
1902         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1903             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1904             return
1905
1906     def _real_extract(self, url):
1907         mobj = re.match(self._VALID_URL, url)
1908         if mobj is None:
1909             raise ExtractorError(u'Invalid URL: %s' % url)
1910         video_id = mobj.group('ID')
1911
1912         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1913         webpage = self._download_webpage(url, video_id)
1914
1915         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1916         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1917         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1918         if not m:
1919             raise ExtractorError(u'Cannot parse data')
1920         data = dict(json.loads(m.group(1)))
1921         params_raw = compat_urllib_parse.unquote(data['params'])
1922         params = json.loads(params_raw)
1923         video_data = params['video_data'][0]
1924         video_url = video_data.get('hd_src')
1925         if not video_url:
1926             video_url = video_data['sd_src']
1927         if not video_url:
1928             raise ExtractorError(u'Cannot find video URL')
1929         video_duration = int(video_data['video_duration'])
1930         thumbnail = video_data['thumbnail_src']
1931
1932         video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1933             webpage, u'title')
1934
1935         info = {
1936             'id': video_id,
1937             'title': video_title,
1938             'url': video_url,
1939             'ext': 'mp4',
1940             'duration': video_duration,
1941             'thumbnail': thumbnail,
1942         }
1943         return [info]
1944
1945
1946 class BlipTVIE(InfoExtractor):
1947     """Information extractor for blip.tv"""
1948
1949     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
1950     _URL_EXT = r'^.*\.([a-z0-9]+)$'
1951     IE_NAME = u'blip.tv'
1952
1953     def report_direct_download(self, title):
1954         """Report information extraction."""
1955         self.to_screen(u'%s: Direct download detected' % title)
1956
1957     def _real_extract(self, url):
1958         mobj = re.match(self._VALID_URL, url)
1959         if mobj is None:
1960             raise ExtractorError(u'Invalid URL: %s' % url)
1961
1962         # See https://github.com/rg3/youtube-dl/issues/857
1963         api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1964         if api_mobj is not None:
1965             url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1966         urlp = compat_urllib_parse_urlparse(url)
1967         if urlp.path.startswith('/play/'):
1968             request = compat_urllib_request.Request(url)
1969             response = compat_urllib_request.urlopen(request)
1970             redirecturl = response.geturl()
1971             rurlp = compat_urllib_parse_urlparse(redirecturl)
1972             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1973             url = 'http://blip.tv/a/a-' + file_id
1974             return self._real_extract(url)
1975
1976
1977         if '?' in url:
1978             cchar = '&'
1979         else:
1980             cchar = '?'
1981         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1982         request = compat_urllib_request.Request(json_url)
1983         request.add_header('User-Agent', 'iTunes/10.6.1')
1984         self.report_extraction(mobj.group(1))
1985         info = None
1986         try:
1987             urlh = compat_urllib_request.urlopen(request)
1988             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1989                 basename = url.split('/')[-1]
1990                 title,ext = os.path.splitext(basename)
1991                 title = title.decode('UTF-8')
1992                 ext = ext.replace('.', '')
1993                 self.report_direct_download(title)
1994                 info = {
1995                     'id': title,
1996                     'url': url,
1997                     'uploader': None,
1998                     'upload_date': None,
1999                     'title': title,
2000                     'ext': ext,
2001                     'urlhandle': urlh
2002                 }
2003         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2004             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2005         if info is None: # Regular URL
2006             try:
2007                 json_code_bytes = urlh.read()
2008                 json_code = json_code_bytes.decode('utf-8')
2009             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2010                 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2011
2012             try:
2013                 json_data = json.loads(json_code)
2014                 if 'Post' in json_data:
2015                     data = json_data['Post']
2016                 else:
2017                     data = json_data
2018
2019                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2020                 video_url = data['media']['url']
2021                 umobj = re.match(self._URL_EXT, video_url)
2022                 if umobj is None:
2023                     raise ValueError('Can not determine filename extension')
2024                 ext = umobj.group(1)
2025
2026                 info = {
2027                     'id': data['item_id'],
2028                     'url': video_url,
2029                     'uploader': data['display_name'],
2030                     'upload_date': upload_date,
2031                     'title': data['title'],
2032                     'ext': ext,
2033                     'format': data['media']['mimeType'],
2034                     'thumbnail': data['thumbnailUrl'],
2035                     'description': data['description'],
2036                     'player_url': data['embedUrl'],
2037                     'user_agent': 'iTunes/10.6.1',
2038                 }
2039             except (ValueError,KeyError) as err:
2040                 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
2041
2042         return [info]
2043
2044
2045 class MyVideoIE(InfoExtractor):
2046     """Information Extractor for myvideo.de."""
2047
2048     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2049     IE_NAME = u'myvideo'
2050
2051     # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2052     # Released into the Public Domain by Tristan Fischer on 2013-05-19
2053     # https://github.com/rg3/youtube-dl/pull/842
2054     def __rc4crypt(self,data, key):
2055         x = 0
2056         box = list(range(256))
2057         for i in list(range(256)):
2058             x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2059             box[i], box[x] = box[x], box[i]
2060         x = 0
2061         y = 0
2062         out = ''
2063         for char in data:
2064             x = (x + 1) % 256
2065             y = (y + box[x]) % 256
2066             box[x], box[y] = box[y], box[x]
2067             out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
2068         return out
2069
2070     def __md5(self,s):
2071         return hashlib.md5(s).hexdigest().encode()
2072
2073     def _real_extract(self,url):
2074         mobj = re.match(self._VALID_URL, url)
2075         if mobj is None:
2076             raise ExtractorError(u'invalid URL: %s' % url)
2077
2078         video_id = mobj.group(1)
2079
2080         GK = (
2081           b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2082           b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2083           b'TnpsbA0KTVRkbU1tSTRNdz09'
2084         )
2085
2086         # Get video webpage
2087         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2088         webpage = self._download_webpage(webpage_url, video_id)
2089
2090         mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2091         if mobj is not None:
2092             self.report_extraction(video_id)
2093             video_url = mobj.group(1) + '.flv'
2094
2095             video_title = self._html_search_regex('<title>([^<]+)</title>',
2096                 webpage, u'title')
2097
2098             video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2099
2100             return [{
2101                 'id':       video_id,
2102                 'url':      video_url,
2103                 'uploader': None,
2104                 'upload_date':  None,
2105                 'title':    video_title,
2106                 'ext':      u'flv',
2107             }]
2108
2109         # try encxml
2110         mobj = re.search('var flashvars={(.+?)}', webpage)
2111         if mobj is None:
2112             raise ExtractorError(u'Unable to extract video')
2113
2114         params = {}
2115         encxml = ''
2116         sec = mobj.group(1)
2117         for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2118             if not a == '_encxml':
2119                 params[a] = b
2120             else:
2121                 encxml = compat_urllib_parse.unquote(b)
2122         if not params.get('domain'):
2123             params['domain'] = 'www.myvideo.de'
2124         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2125         if 'flash_playertype=MTV' in xmldata_url:
2126             self._downloader.report_warning(u'avoiding MTV player')
2127             xmldata_url = (
2128                 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2129                 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
2130             ) % video_id
2131
2132         # get enc data
2133         enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2134         enc_data_b = binascii.unhexlify(enc_data)
2135         sk = self.__md5(
2136             base64.b64decode(base64.b64decode(GK)) +
2137             self.__md5(
2138                 str(video_id).encode('utf-8')
2139             )
2140         )
2141         dec_data = self.__rc4crypt(enc_data_b, sk)
2142
2143         # extracting infos
2144         self.report_extraction(video_id)
2145
2146         video_url = None
2147         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2148         if mobj:
2149             video_url = compat_urllib_parse.unquote(mobj.group(1))
2150             if 'myvideo2flash' in video_url:
2151                 self._downloader.report_warning(u'forcing RTMPT ...')
2152                 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2153
2154         if not video_url:
2155             # extract non rtmp videos
2156             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2157             if mobj is None:
2158                 raise ExtractorError(u'unable to extract url')
2159             video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2160
2161         video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2162         video_file = compat_urllib_parse.unquote(video_file)
2163
2164         if not video_file.endswith('f4m'):
2165             ppath, prefix = video_file.split('.')
2166             video_playpath = '%s:%s' % (prefix, ppath)
2167             video_hls_playlist = ''
2168         else:
2169             video_playpath = ''
2170             video_hls_playlist = (
2171                 video_filepath + video_file
2172             ).replace('.f4m', '.m3u8')
2173
2174         video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2175         video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2176
2177         video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2178             webpage, u'title')
2179
2180         return [{
2181             'id':                 video_id,
2182             'url':                video_url,
2183             'tc_url':             video_url,
2184             'uploader':           None,
2185             'upload_date':        None,
2186             'title':              video_title,
2187             'ext':                u'flv',
2188             'play_path':          video_playpath,
2189             'video_file':         video_file,
2190             'video_hls_playlist': video_hls_playlist,
2191             'player_url':         video_swfobj,
2192         }]
2193
2194
2195 class ComedyCentralIE(InfoExtractor):
2196     """Information extractor for The Daily Show and Colbert Report """
2197
2198     # urls can be abbreviations like :thedailyshow or :colbert
2199     # urls for episodes like:
2200     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2201     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2202     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2203     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2204                       |(https?://)?(www\.)?
2205                           (?P<showname>thedailyshow|colbertnation)\.com/
2206                          (full-episodes/(?P<episode>.*)|
2207                           (?P<clip>
2208                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2209                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2210                      $"""
2211
2212     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2213
2214     _video_extensions = {
2215         '3500': 'mp4',
2216         '2200': 'mp4',
2217         '1700': 'mp4',
2218         '1200': 'mp4',
2219         '750': 'mp4',
2220         '400': 'mp4',
2221     }
2222     _video_dimensions = {
2223         '3500': '1280x720',
2224         '2200': '960x540',
2225         '1700': '768x432',
2226         '1200': '640x360',
2227         '750': '512x288',
2228         '400': '384x216',
2229     }
2230
2231     @classmethod
2232     def suitable(cls, url):
2233         """Receives a URL and returns True if suitable for this IE."""
2234         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2235
2236     def _print_formats(self, formats):
2237         print('Available formats:')
2238         for x in formats:
2239             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2240
2241
2242     def _real_extract(self, url):
2243         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2244         if mobj is None:
2245             raise ExtractorError(u'Invalid URL: %s' % url)
2246
2247         if mobj.group('shortname'):
2248             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2249                 url = u'http://www.thedailyshow.com/full-episodes/'
2250             else:
2251                 url = u'http://www.colbertnation.com/full-episodes/'
2252             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2253             assert mobj is not None
2254
2255         if mobj.group('clip'):
2256             if mobj.group('showname') == 'thedailyshow':
2257                 epTitle = mobj.group('tdstitle')
2258             else:
2259                 epTitle = mobj.group('cntitle')
2260             dlNewest = False
2261         else:
2262             dlNewest = not mobj.group('episode')
2263             if dlNewest:
2264                 epTitle = mobj.group('showname')
2265             else:
2266                 epTitle = mobj.group('episode')
2267
2268         self.report_extraction(epTitle)
2269         webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2270         if dlNewest:
2271             url = htmlHandle.geturl()
2272             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2273             if mobj is None:
2274                 raise ExtractorError(u'Invalid redirected URL: ' + url)
2275             if mobj.group('episode') == '':
2276                 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2277             epTitle = mobj.group('episode')
2278
2279         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2280
2281         if len(mMovieParams) == 0:
2282             # The Colbert Report embeds the information in a without
2283             # a URL prefix; so extract the alternate reference
2284             # and then add the URL prefix manually.
2285
2286             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2287             if len(altMovieParams) == 0:
2288                 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2289             else:
2290                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2291
2292         uri = mMovieParams[0][1]
2293         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2294         indexXml = self._download_webpage(indexUrl, epTitle,
2295                                           u'Downloading show index',
2296                                           u'unable to download episode index')
2297
2298         results = []
2299
2300         idoc = xml.etree.ElementTree.fromstring(indexXml)
2301         itemEls = idoc.findall('.//item')
2302         for partNum,itemEl in enumerate(itemEls):
2303             mediaId = itemEl.findall('./guid')[0].text
2304             shortMediaId = mediaId.split(':')[-1]
2305             showId = mediaId.split(':')[-2].replace('.com', '')
2306             officialTitle = itemEl.findall('./title')[0].text
2307             officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2308
2309             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2310                         compat_urllib_parse.urlencode({'uri': mediaId}))
2311             configXml = self._download_webpage(configUrl, epTitle,
2312                                                u'Downloading configuration for %s' % shortMediaId)
2313
2314             cdoc = xml.etree.ElementTree.fromstring(configXml)
2315             turls = []
2316             for rendition in cdoc.findall('.//rendition'):
2317                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2318                 turls.append(finfo)
2319
2320             if len(turls) == 0:
2321                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2322                 continue
2323
2324             if self._downloader.params.get('listformats', None):
2325                 self._print_formats([i[0] for i in turls])
2326                 return
2327
2328             # For now, just pick the highest bitrate
2329             format,rtmp_video_url = turls[-1]
2330
2331             # Get the format arg from the arg stream
2332             req_format = self._downloader.params.get('format', None)
2333
2334             # Select format if we can find one
2335             for f,v in turls:
2336                 if f == req_format:
2337                     format, rtmp_video_url = f, v
2338                     break
2339
2340             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2341             if not m:
2342                 raise ExtractorError(u'Cannot transform RTMP url')
2343             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2344             video_url = base + m.group('finalid')
2345
2346             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2347             info = {
2348                 'id': shortMediaId,
2349                 'url': video_url,
2350                 'uploader': showId,
2351                 'upload_date': officialDate,
2352                 'title': effTitle,
2353                 'ext': 'mp4',
2354                 'format': format,
2355                 'thumbnail': None,
2356                 'description': officialTitle,
2357             }
2358             results.append(info)
2359
2360         return results
2361
2362
2363 class EscapistIE(InfoExtractor):
2364     """Information extractor for The Escapist """
2365
2366     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2367     IE_NAME = u'escapist'
2368
2369     def _real_extract(self, url):
2370         mobj = re.match(self._VALID_URL, url)
2371         if mobj is None:
2372             raise ExtractorError(u'Invalid URL: %s' % url)
2373         showName = mobj.group('showname')
2374         videoId = mobj.group('episode')
2375
2376         self.report_extraction(videoId)
2377         webpage = self._download_webpage(url, videoId)
2378
2379         videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2380             webpage, u'description', fatal=False)
2381
2382         imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2383             webpage, u'thumbnail', fatal=False)
2384
2385         playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2386             webpage, u'player url')
2387
2388         title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2389             webpage, u'player url').split(' : ')[-1]
2390
2391         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2392         configUrl = compat_urllib_parse.unquote(configUrl)
2393
2394         configJSON = self._download_webpage(configUrl, videoId,
2395                                             u'Downloading configuration',
2396                                             u'unable to download configuration')
2397
2398         # Technically, it's JavaScript, not JSON
2399         configJSON = configJSON.replace("'", '"')
2400
2401         try:
2402             config = json.loads(configJSON)
2403         except (ValueError,) as err:
2404             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2405
2406         playlist = config['playlist']
2407         videoUrl = playlist[1]['url']
2408
2409         info = {
2410             'id': videoId,
2411             'url': videoUrl,
2412             'uploader': showName,
2413             'upload_date': None,
2414             'title': title,
2415             'ext': 'mp4',
2416             'thumbnail': imgUrl,
2417             'description': videoDesc,
2418             'player_url': playerUrl,
2419         }
2420
2421         return [info]
2422
2423 class CollegeHumorIE(InfoExtractor):
2424     """Information extractor for collegehumor.com"""
2425
2426     _WORKING = False
2427     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2428     IE_NAME = u'collegehumor'
2429
2430     def report_manifest(self, video_id):
2431         """Report information extraction."""
2432         self.to_screen(u'%s: Downloading XML manifest' % video_id)
2433
2434     def _real_extract(self, url):
2435         mobj = re.match(self._VALID_URL, url)
2436         if mobj is None:
2437             raise ExtractorError(u'Invalid URL: %s' % url)
2438         video_id = mobj.group('videoid')
2439
2440         info = {
2441             'id': video_id,
2442             'uploader': None,
2443             'upload_date': None,
2444         }
2445
2446         self.report_extraction(video_id)
2447         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2448         try:
2449             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2450         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2451             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2452
2453         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2454         try:
2455             videoNode = mdoc.findall('./video')[0]
2456             info['description'] = videoNode.findall('./description')[0].text
2457             info['title'] = videoNode.findall('./caption')[0].text
2458             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2459             manifest_url = videoNode.findall('./file')[0].text
2460         except IndexError:
2461             raise ExtractorError(u'Invalid metadata XML file')
2462
2463         manifest_url += '?hdcore=2.10.3'
2464         self.report_manifest(video_id)
2465         try:
2466             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2467         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2468             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2469
2470         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2471         try:
2472             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2473             node_id = media_node.attrib['url']
2474             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2475         except IndexError as err:
2476             raise ExtractorError(u'Invalid manifest file')
2477
2478         url_pr = compat_urllib_parse_urlparse(manifest_url)
2479         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2480
2481         info['url'] = url
2482         info['ext'] = 'f4f'
2483         return [info]
2484
2485
2486 class XVideosIE(InfoExtractor):
2487     """Information extractor for xvideos.com"""
2488
2489     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2490     IE_NAME = u'xvideos'
2491
2492     def _real_extract(self, url):
2493         mobj = re.match(self._VALID_URL, url)
2494         if mobj is None:
2495             raise ExtractorError(u'Invalid URL: %s' % url)
2496         video_id = mobj.group(1)
2497
2498         webpage = self._download_webpage(url, video_id)
2499
2500         self.report_extraction(video_id)
2501
2502         # Extract video URL
2503         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2504             webpage, u'video URL'))
2505
2506         # Extract title
2507         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2508             webpage, u'title')
2509
2510         # Extract video thumbnail
2511         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2512             webpage, u'thumbnail', fatal=False)
2513
2514         info = {
2515             'id': video_id,
2516             'url': video_url,
2517             'uploader': None,
2518             'upload_date': None,
2519             'title': video_title,
2520             'ext': 'flv',
2521             'thumbnail': video_thumbnail,
2522             'description': None,
2523         }
2524
2525         return [info]
2526
2527
2528 class SoundcloudIE(InfoExtractor):
2529     """Information extractor for soundcloud.com
2530        To access the media, the uid of the song and a stream token
2531        must be extracted from the page source and the script must make
2532        a request to media.soundcloud.com/crossdomain.xml. Then
2533        the media can be grabbed by requesting from an url composed
2534        of the stream token and uid
2535      """
2536
2537     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2538     IE_NAME = u'soundcloud'
2539
2540     def report_resolve(self, video_id):
2541         """Report information extraction."""
2542         self.to_screen(u'%s: Resolving id' % video_id)
2543
2544     def _real_extract(self, url):
2545         mobj = re.match(self._VALID_URL, url)
2546         if mobj is None:
2547             raise ExtractorError(u'Invalid URL: %s' % url)
2548
2549         # extract uploader (which is in the url)
2550         uploader = mobj.group(1)
2551         # extract simple title (uploader + slug of song title)
2552         slug_title =  mobj.group(2)
2553         simple_title = uploader + u'-' + slug_title
2554         full_title = '%s/%s' % (uploader, slug_title)
2555
2556         self.report_resolve(full_title)
2557
2558         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2559         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2560         info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2561
2562         info = json.loads(info_json)
2563         video_id = info['id']
2564         self.report_extraction(full_title)
2565
2566         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2567         stream_json = self._download_webpage(streams_url, full_title,
2568                                              u'Downloading stream definitions',
2569                                              u'unable to download stream definitions')
2570
2571         streams = json.loads(stream_json)
2572         mediaURL = streams['http_mp3_128_url']
2573         upload_date = unified_strdate(info['created_at'])
2574
2575         return [{
2576             'id':       info['id'],
2577             'url':      mediaURL,
2578             'uploader': info['user']['username'],
2579             'upload_date': upload_date,
2580             'title':    info['title'],
2581             'ext':      u'mp3',
2582             'description': info['description'],
2583         }]
2584
2585 class SoundcloudSetIE(InfoExtractor):
2586     """Information extractor for soundcloud.com sets
2587        To access the media, the uid of the song and a stream token
2588        must be extracted from the page source and the script must make
2589        a request to media.soundcloud.com/crossdomain.xml. Then
2590        the media can be grabbed by requesting from an url composed
2591        of the stream token and uid
2592      """
2593
2594     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2595     IE_NAME = u'soundcloud:set'
2596
2597     def report_resolve(self, video_id):
2598         """Report information extraction."""
2599         self.to_screen(u'%s: Resolving id' % video_id)
2600
2601     def _real_extract(self, url):
2602         mobj = re.match(self._VALID_URL, url)
2603         if mobj is None:
2604             raise ExtractorError(u'Invalid URL: %s' % url)
2605
2606         # extract uploader (which is in the url)
2607         uploader = mobj.group(1)
2608         # extract simple title (uploader + slug of song title)
2609         slug_title =  mobj.group(2)
2610         simple_title = uploader + u'-' + slug_title
2611         full_title = '%s/sets/%s' % (uploader, slug_title)
2612
2613         self.report_resolve(full_title)
2614
2615         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2616         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2617         info_json = self._download_webpage(resolv_url, full_title)
2618
2619         videos = []
2620         info = json.loads(info_json)
2621         if 'errors' in info:
2622             for err in info['errors']:
2623                 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2624             return
2625
2626         self.report_extraction(full_title)
2627         for track in info['tracks']:
2628             video_id = track['id']
2629
2630             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2631             stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2632
2633             self.report_extraction(video_id)
2634             streams = json.loads(stream_json)
2635             mediaURL = streams['http_mp3_128_url']
2636
2637             videos.append({
2638                 'id':       video_id,
2639                 'url':      mediaURL,
2640                 'uploader': track['user']['username'],
2641                 'upload_date':  unified_strdate(track['created_at']),
2642                 'title':    track['title'],
2643                 'ext':      u'mp3',
2644                 'description': track['description'],
2645             })
2646         return videos
2647
2648
2649 class InfoQIE(InfoExtractor):
2650     """Information extractor for infoq.com"""
2651     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2652
2653     def _real_extract(self, url):
2654         mobj = re.match(self._VALID_URL, url)
2655         if mobj is None:
2656             raise ExtractorError(u'Invalid URL: %s' % url)
2657
2658         webpage = self._download_webpage(url, video_id=url)
2659         self.report_extraction(url)
2660
2661         # Extract video URL
2662         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2663         if mobj is None:
2664             raise ExtractorError(u'Unable to extract video url')
2665         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2666         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2667
2668         # Extract title
2669         video_title = self._search_regex(r'contentTitle = "(.*?)";',
2670             webpage, u'title')
2671
2672         # Extract description
2673         video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2674             webpage, u'description', fatal=False)
2675
2676         video_filename = video_url.split('/')[-1]
2677         video_id, extension = video_filename.split('.')
2678
2679         info = {
2680             'id': video_id,
2681             'url': video_url,
2682             'uploader': None,
2683             'upload_date': None,
2684             'title': video_title,
2685             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2686             'thumbnail': None,
2687             'description': video_description,
2688         }
2689
2690         return [info]
2691
2692 class MixcloudIE(InfoExtractor):
2693     """Information extractor for www.mixcloud.com"""
2694
2695     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2696     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2697     IE_NAME = u'mixcloud'
2698
2699     def report_download_json(self, file_id):
2700         """Report JSON download."""
2701         self.to_screen(u'Downloading json')
2702
2703     def get_urls(self, jsonData, fmt, bitrate='best'):
2704         """Get urls from 'audio_formats' section in json"""
2705         file_url = None
2706         try:
2707             bitrate_list = jsonData[fmt]
2708             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2709                 bitrate = max(bitrate_list) # select highest
2710
2711             url_list = jsonData[fmt][bitrate]
2712         except TypeError: # we have no bitrate info.
2713             url_list = jsonData[fmt]
2714         return url_list
2715
2716     def check_urls(self, url_list):
2717         """Returns 1st active url from list"""
2718         for url in url_list:
2719             try:
2720                 compat_urllib_request.urlopen(url)
2721                 return url
2722             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2723                 url = None
2724
2725         return None
2726
2727     def _print_formats(self, formats):
2728         print('Available formats:')
2729         for fmt in formats.keys():
2730             for b in formats[fmt]:
2731                 try:
2732                     ext = formats[fmt][b][0]
2733                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2734                 except TypeError: # we have no bitrate info
2735                     ext = formats[fmt][0]
2736                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2737                     break
2738
2739     def _real_extract(self, url):
2740         mobj = re.match(self._VALID_URL, url)
2741         if mobj is None:
2742             raise ExtractorError(u'Invalid URL: %s' % url)
2743         # extract uploader & filename from url
2744         uploader = mobj.group(1).decode('utf-8')
2745         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2746
2747         # construct API request
2748         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2749         # retrieve .json file with links to files
2750         request = compat_urllib_request.Request(file_url)
2751         try:
2752             self.report_download_json(file_url)
2753             jsonData = compat_urllib_request.urlopen(request).read()
2754         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2755             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2756
2757         # parse JSON
2758         json_data = json.loads(jsonData)
2759         player_url = json_data['player_swf_url']
2760         formats = dict(json_data['audio_formats'])
2761
2762         req_format = self._downloader.params.get('format', None)
2763         bitrate = None
2764
2765         if self._downloader.params.get('listformats', None):
2766             self._print_formats(formats)
2767             return
2768
2769         if req_format is None or req_format == 'best':
2770             for format_param in formats.keys():
2771                 url_list = self.get_urls(formats, format_param)
2772                 # check urls
2773                 file_url = self.check_urls(url_list)
2774                 if file_url is not None:
2775                     break # got it!
2776         else:
2777             if req_format not in formats:
2778                 raise ExtractorError(u'Format is not available')
2779
2780             url_list = self.get_urls(formats, req_format)
2781             file_url = self.check_urls(url_list)
2782             format_param = req_format
2783
2784         return [{
2785             'id': file_id.decode('utf-8'),
2786             'url': file_url.decode('utf-8'),
2787             'uploader': uploader.decode('utf-8'),
2788             'upload_date': None,
2789             'title': json_data['name'],
2790             'ext': file_url.split('.')[-1].decode('utf-8'),
2791             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2792             'thumbnail': json_data['thumbnail_url'],
2793             'description': json_data['description'],
2794             'player_url': player_url.decode('utf-8'),
2795         }]
2796
2797 class StanfordOpenClassroomIE(InfoExtractor):
2798     """Information extractor for Stanford's Open ClassRoom"""
2799
2800     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2801     IE_NAME = u'stanfordoc'
2802
2803     def _real_extract(self, url):
2804         mobj = re.match(self._VALID_URL, url)
2805         if mobj is None:
2806             raise ExtractorError(u'Invalid URL: %s' % url)
2807
2808         if mobj.group('course') and mobj.group('video'): # A specific video
2809             course = mobj.group('course')
2810             video = mobj.group('video')
2811             info = {
2812                 'id': course + '_' + video,
2813                 'uploader': None,
2814                 'upload_date': None,
2815             }
2816
2817             self.report_extraction(info['id'])
2818             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2819             xmlUrl = baseUrl + video + '.xml'
2820             try:
2821                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2822             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2823                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2824             mdoc = xml.etree.ElementTree.fromstring(metaXml)
2825             try:
2826                 info['title'] = mdoc.findall('./title')[0].text
2827                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2828             except IndexError:
2829                 raise ExtractorError(u'Invalid metadata XML file')
2830             info['ext'] = info['url'].rpartition('.')[2]
2831             return [info]
2832         elif mobj.group('course'): # A course page
2833             course = mobj.group('course')
2834             info = {
2835                 'id': course,
2836                 'type': 'playlist',
2837                 'uploader': None,
2838                 'upload_date': None,
2839             }
2840
2841             coursepage = self._download_webpage(url, info['id'],
2842                                         note='Downloading course info page',
2843                                         errnote='Unable to download course info page')
2844
2845             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2846
2847             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2848                 coursepage, u'description', fatal=False)
2849
2850             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2851             info['list'] = [
2852                 {
2853                     'type': 'reference',
2854                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2855                 }
2856                     for vpage in links]
2857             results = []
2858             for entry in info['list']:
2859                 assert entry['type'] == 'reference'
2860                 results += self.extract(entry['url'])
2861             return results
2862         else: # Root page
2863             info = {
2864                 'id': 'Stanford OpenClassroom',
2865                 'type': 'playlist',
2866                 'uploader': None,
2867                 'upload_date': None,
2868             }
2869
2870             self.report_download_webpage(info['id'])
2871             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2872             try:
2873                 rootpage = compat_urllib_request.urlopen(rootURL).read()
2874             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2875                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2876
2877             info['title'] = info['id']
2878
2879             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2880             info['list'] = [
2881                 {
2882                     'type': 'reference',
2883                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2884                 }
2885                     for cpage in links]
2886
2887             results = []
2888             for entry in info['list']:
2889                 assert entry['type'] == 'reference'
2890                 results += self.extract(entry['url'])
2891             return results
2892
2893 class MTVIE(InfoExtractor):
2894     """Information extractor for MTV.com"""
2895
2896     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2897     IE_NAME = u'mtv'
2898
2899     def _real_extract(self, url):
2900         mobj = re.match(self._VALID_URL, url)
2901         if mobj is None:
2902             raise ExtractorError(u'Invalid URL: %s' % url)
2903         if not mobj.group('proto'):
2904             url = 'http://' + url
2905         video_id = mobj.group('videoid')
2906
2907         webpage = self._download_webpage(url, video_id)
2908
2909         song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2910             webpage, u'song name', fatal=False)
2911
2912         video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2913             webpage, u'title')
2914
2915         mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2916             webpage, u'mtvn_uri', fatal=False)
2917
2918         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2919             webpage, u'content id', fatal=False)
2920
2921         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2922         self.report_extraction(video_id)
2923         request = compat_urllib_request.Request(videogen_url)
2924         try:
2925             metadataXml = compat_urllib_request.urlopen(request).read()
2926         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2927             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2928
2929         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2930         renditions = mdoc.findall('.//rendition')
2931
2932         # For now, always pick the highest quality.
2933         rendition = renditions[-1]
2934
2935         try:
2936             _,_,ext = rendition.attrib['type'].partition('/')
2937             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2938             video_url = rendition.find('./src').text
2939         except KeyError:
2940             raise ExtractorError('Invalid rendition field.')
2941
2942         info = {
2943             'id': video_id,
2944             'url': video_url,
2945             'uploader': performer,
2946             'upload_date': None,
2947             'title': video_title,
2948             'ext': ext,
2949             'format': format,
2950         }
2951
2952         return [info]
2953
2954
2955 class YoukuIE(InfoExtractor):
2956     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2957
2958     def _gen_sid(self):
2959         nowTime = int(time.time() * 1000)
2960         random1 = random.randint(1000,1998)
2961         random2 = random.randint(1000,9999)
2962
2963         return "%d%d%d" %(nowTime,random1,random2)
2964
2965     def _get_file_ID_mix_string(self, seed):
2966         mixed = []
2967         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2968         seed = float(seed)
2969         for i in range(len(source)):
2970             seed  =  (seed * 211 + 30031 ) % 65536
2971             index  =  math.floor(seed / 65536 * len(source) )
2972             mixed.append(source[int(index)])
2973             source.remove(source[int(index)])
2974         #return ''.join(mixed)
2975         return mixed
2976
2977     def _get_file_id(self, fileId, seed):
2978         mixed = self._get_file_ID_mix_string(seed)
2979         ids = fileId.split('*')
2980         realId = []
2981         for ch in ids:
2982             if ch:
2983                 realId.append(mixed[int(ch)])
2984         return ''.join(realId)
2985
2986     def _real_extract(self, url):
2987         mobj = re.match(self._VALID_URL, url)
2988         if mobj is None:
2989             raise ExtractorError(u'Invalid URL: %s' % url)
2990         video_id = mobj.group('ID')
2991
2992         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2993
2994         jsondata = self._download_webpage(info_url, video_id)
2995
2996         self.report_extraction(video_id)
2997         try:
2998             config = json.loads(jsondata)
2999
3000             video_title =  config['data'][0]['title']
3001             seed = config['data'][0]['seed']
3002
3003             format = self._downloader.params.get('format', None)
3004             supported_format = list(config['data'][0]['streamfileids'].keys())
3005
3006             if format is None or format == 'best':
3007                 if 'hd2' in supported_format:
3008                     format = 'hd2'
3009                 else:
3010                     format = 'flv'
3011                 ext = u'flv'
3012             elif format == 'worst':
3013                 format = 'mp4'
3014                 ext = u'mp4'
3015             else:
3016                 format = 'flv'
3017                 ext = u'flv'
3018
3019
3020             fileid = config['data'][0]['streamfileids'][format]
3021             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3022         except (UnicodeDecodeError, ValueError, KeyError):
3023             raise ExtractorError(u'Unable to extract info section')
3024
3025         files_info=[]
3026         sid = self._gen_sid()
3027         fileid = self._get_file_id(fileid, seed)
3028
3029         #column 8,9 of fileid represent the segment number
3030         #fileid[7:9] should be changed
3031         for index, key in enumerate(keys):
3032
3033             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3034             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3035
3036             info = {
3037                 'id': '%s_part%02d' % (video_id, index),
3038                 'url': download_url,
3039                 'uploader': None,
3040                 'upload_date': None,
3041                 'title': video_title,
3042                 'ext': ext,
3043             }
3044             files_info.append(info)
3045
3046         return files_info
3047
3048
3049 class XNXXIE(InfoExtractor):
3050     """Information extractor for xnxx.com"""
3051
3052     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3053     IE_NAME = u'xnxx'
3054     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3055     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3056     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3057
3058     def _real_extract(self, url):
3059         mobj = re.match(self._VALID_URL, url)
3060         if mobj is None:
3061             raise ExtractorError(u'Invalid URL: %s' % url)
3062         video_id = mobj.group(1)
3063
3064         # Get webpage content
3065         webpage = self._download_webpage(url, video_id)
3066
3067         video_url = self._search_regex(self.VIDEO_URL_RE,
3068             webpage, u'video URL')
3069         video_url = compat_urllib_parse.unquote(video_url)
3070
3071         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
3072             webpage, u'title')
3073
3074         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3075             webpage, u'thumbnail', fatal=False)
3076
3077         return [{
3078             'id': video_id,
3079             'url': video_url,
3080             'uploader': None,
3081             'upload_date': None,
3082             'title': video_title,
3083             'ext': 'flv',
3084             'thumbnail': video_thumbnail,
3085             'description': None,
3086         }]
3087
3088
3089 class GooglePlusIE(InfoExtractor):
3090     """Information extractor for plus.google.com."""
3091
3092     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3093     IE_NAME = u'plus.google'
3094
3095     def _real_extract(self, url):
3096         # Extract id from URL
3097         mobj = re.match(self._VALID_URL, url)
3098         if mobj is None:
3099             raise ExtractorError(u'Invalid URL: %s' % url)
3100
3101         post_url = mobj.group(0)
3102         video_id = mobj.group(1)
3103
3104         video_extension = 'flv'
3105
3106         # Step 1, Retrieve post webpage to extract further information
3107         webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3108
3109         self.report_extraction(video_id)
3110
3111         # Extract update date
3112         upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3113             webpage, u'upload date', fatal=False)
3114         if upload_date:
3115             # Convert timestring to a format suitable for filename
3116             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3117             upload_date = upload_date.strftime('%Y%m%d')
3118
3119         # Extract uploader
3120         uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3121             webpage, u'uploader', fatal=False)
3122
3123         # Extract title
3124         # Get the first line for title
3125         video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3126             webpage, 'title', default=u'NA')
3127
3128         # Step 2, Stimulate clicking the image box to launch video
3129         video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3130             webpage, u'video page URL')
3131         webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3132
3133         # Extract video links on video page
3134         """Extract video links of all sizes"""
3135         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3136         mobj = re.findall(pattern, webpage)
3137         if len(mobj) == 0:
3138             raise ExtractorError(u'Unable to extract video links')
3139
3140         # Sort in resolution
3141         links = sorted(mobj)
3142
3143         # Choose the lowest of the sort, i.e. highest resolution
3144         video_url = links[-1]
3145         # Only get the url. The resolution part in the tuple has no use anymore
3146         video_url = video_url[-1]
3147         # Treat escaped \u0026 style hex
3148         try:
3149             video_url = video_url.decode("unicode_escape")
3150         except AttributeError: # Python 3
3151             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3152
3153
3154         return [{
3155             'id':       video_id,
3156             'url':      video_url,
3157             'uploader': uploader,
3158             'upload_date':  upload_date,
3159             'title':    video_title,
3160             'ext':      video_extension,
3161         }]
3162
3163 class NBAIE(InfoExtractor):
3164     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3165     IE_NAME = u'nba'
3166
3167     def _real_extract(self, url):
3168         mobj = re.match(self._VALID_URL, url)
3169         if mobj is None:
3170             raise ExtractorError(u'Invalid URL: %s' % url)
3171
3172         video_id = mobj.group(1)
3173
3174         webpage = self._download_webpage(url, video_id)
3175
3176         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3177
3178         shortened_video_id = video_id.rpartition('/')[2]
3179         title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3180             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3181
3182         # It isn't there in the HTML it returns to us
3183         # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3184
3185         description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3186
3187         info = {
3188             'id': shortened_video_id,
3189             'url': video_url,
3190             'ext': 'mp4',
3191             'title': title,
3192             # 'uploader_date': uploader_date,
3193             'description': description,
3194         }
3195         return [info]
3196
3197 class JustinTVIE(InfoExtractor):
3198     """Information extractor for justin.tv and twitch.tv"""
3199     # TODO: One broadcast may be split into multiple videos. The key
3200     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3201     # starts at 1 and increases. Can we treat all parts as one video?
3202
3203     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3204         (?:
3205             (?P<channelid>[^/]+)|
3206             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3207             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3208         )
3209         /?(?:\#.*)?$
3210         """
3211     _JUSTIN_PAGE_LIMIT = 100
3212     IE_NAME = u'justin.tv'
3213
3214     def report_download_page(self, channel, offset):
3215         """Report attempt to download a single page of videos."""
3216         self.to_screen(u'%s: Downloading video information from %d to %d' %
3217                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3218
3219     # Return count of items, list of *valid* items
3220     def _parse_page(self, url, video_id):
3221         webpage = self._download_webpage(url, video_id,
3222                                          u'Downloading video info JSON',
3223                                          u'unable to download video info JSON')
3224
3225         response = json.loads(webpage)
3226         if type(response) != list:
3227             error_text = response.get('error', 'unknown error')
3228             raise ExtractorError(u'Justin.tv API: %s' % error_text)
3229         info = []
3230         for clip in response:
3231             video_url = clip['video_file_url']
3232             if video_url:
3233                 video_extension = os.path.splitext(video_url)[1][1:]
3234                 video_date = re.sub('-', '', clip['start_time'][:10])
3235                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3236                 video_id = clip['id']
3237                 video_title = clip.get('title', video_id)
3238                 info.append({
3239                     'id': video_id,
3240                     'url': video_url,
3241                     'title': video_title,
3242                     'uploader': clip.get('channel_name', video_uploader_id),
3243                     'uploader_id': video_uploader_id,
3244                     'upload_date': video_date,
3245                     'ext': video_extension,
3246                 })
3247         return (len(response), info)
3248
3249     def _real_extract(self, url):
3250         mobj = re.match(self._VALID_URL, url)
3251         if mobj is None:
3252             raise ExtractorError(u'invalid URL: %s' % url)
3253
3254         api_base = 'http://api.justin.tv'
3255         paged = False
3256         if mobj.group('channelid'):
3257             paged = True
3258             video_id = mobj.group('channelid')
3259             api = api_base + '/channel/archives/%s.json' % video_id
3260         elif mobj.group('chapterid'):
3261             chapter_id = mobj.group('chapterid')
3262
3263             webpage = self._download_webpage(url, chapter_id)
3264             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3265             if not m:
3266                 raise ExtractorError(u'Cannot find archive of a chapter')
3267             archive_id = m.group(1)
3268
3269             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3270             chapter_info_xml = self._download_webpage(api, chapter_id,
3271                                              note=u'Downloading chapter information',
3272                                              errnote=u'Chapter information download failed')
3273             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3274             for a in doc.findall('.//archive'):
3275                 if archive_id == a.find('./id').text:
3276                     break
3277             else:
3278                 raise ExtractorError(u'Could not find chapter in chapter information')
3279
3280             video_url = a.find('./video_file_url').text
3281             video_ext = video_url.rpartition('.')[2] or u'flv'
3282
3283             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3284             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3285                                    note='Downloading chapter metadata',
3286                                    errnote='Download of chapter metadata failed')
3287             chapter_info = json.loads(chapter_info_json)
3288
3289             bracket_start = int(doc.find('.//bracket_start').text)
3290             bracket_end = int(doc.find('.//bracket_end').text)
3291
3292             # TODO determine start (and probably fix up file)
3293             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3294             #video_url += u'?start=' + TODO:start_timestamp
3295             # bracket_start is 13290, but we want 51670615
3296             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3297                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3298
3299             info = {
3300                 'id': u'c' + chapter_id,
3301                 'url': video_url,
3302                 'ext': video_ext,
3303                 'title': chapter_info['title'],
3304                 'thumbnail': chapter_info['preview'],
3305                 'description': chapter_info['description'],
3306                 'uploader': chapter_info['channel']['display_name'],
3307                 'uploader_id': chapter_info['channel']['name'],
3308             }
3309             return [info]
3310         else:
3311             video_id = mobj.group('videoid')
3312             api = api_base + '/broadcast/by_archive/%s.json' % video_id
3313
3314         self.report_extraction(video_id)
3315
3316         info = []
3317         offset = 0
3318         limit = self._JUSTIN_PAGE_LIMIT
3319         while True:
3320             if paged:
3321                 self.report_download_page(video_id, offset)
3322             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3323             page_count, page_info = self._parse_page(page_url, video_id)
3324             info.extend(page_info)
3325             if not paged or page_count != limit:
3326                 break
3327             offset += limit
3328         return info
3329
3330 class FunnyOrDieIE(InfoExtractor):
3331     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3332
3333     def _real_extract(self, url):
3334         mobj = re.match(self._VALID_URL, url)
3335         if mobj is None:
3336             raise ExtractorError(u'invalid URL: %s' % url)
3337
3338         video_id = mobj.group('id')
3339         webpage = self._download_webpage(url, video_id)
3340
3341         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3342             webpage, u'video URL', flags=re.DOTALL)
3343
3344         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3345             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3346
3347         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3348             webpage, u'description', fatal=False, flags=re.DOTALL)
3349
3350         info = {
3351             'id': video_id,
3352             'url': video_url,
3353             'ext': 'mp4',
3354             'title': title,
3355             'description': video_description,
3356         }
3357         return [info]
3358
3359 class SteamIE(InfoExtractor):
3360     _VALID_URL = r"""http://store\.steampowered\.com/
3361                 (agecheck/)?
3362                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3363                 (?P<gameID>\d+)/?
3364                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3365                 """
3366     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
3367     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
3368
3369     @classmethod
3370     def suitable(cls, url):
3371         """Receives a URL and returns True if suitable for this IE."""
3372         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3373
3374     def _real_extract(self, url):
3375         m = re.match(self._VALID_URL, url, re.VERBOSE)
3376         gameID = m.group('gameID')
3377
3378         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3379         webpage = self._download_webpage(videourl, gameID)
3380
3381         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3382             videourl = self._AGECHECK_TEMPLATE % gameID
3383             self.report_age_confirmation()
3384             webpage = self._download_webpage(videourl, gameID)
3385
3386         self.report_extraction(gameID)
3387         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3388                                              webpage, 'game title')
3389
3390         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3391         mweb = re.finditer(urlRE, webpage)
3392         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3393         titles = re.finditer(namesRE, webpage)
3394         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3395         thumbs = re.finditer(thumbsRE, webpage)
3396         videos = []
3397         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3398             video_id = vid.group('videoID')
3399             title = vtitle.group('videoName')
3400             video_url = vid.group('videoURL')
3401             video_thumb = thumb.group('thumbnail')
3402             if not video_url:
3403                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3404             info = {
3405                 'id':video_id,
3406                 'url':video_url,
3407                 'ext': 'flv',
3408                 'title': unescapeHTML(title),
3409                 'thumbnail': video_thumb
3410                   }
3411             videos.append(info)
3412         return [self.playlist_result(videos, gameID, game_title)]
3413
3414 class UstreamIE(InfoExtractor):
3415     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3416     IE_NAME = u'ustream'
3417
3418     def _real_extract(self, url):
3419         m = re.match(self._VALID_URL, url)
3420         video_id = m.group('videoID')
3421
3422         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3423         webpage = self._download_webpage(url, video_id)
3424
3425         self.report_extraction(video_id)
3426
3427         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3428             webpage, u'title')
3429
3430         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3431             webpage, u'uploader', fatal=False, flags=re.DOTALL)
3432
3433         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3434             webpage, u'thumbnail', fatal=False)
3435
3436         info = {
3437                 'id': video_id,
3438                 'url': video_url,
3439                 'ext': 'flv',
3440                 'title': video_title,
3441                 'uploader': uploader,
3442                 'thumbnail': thumbnail,
3443                }
3444         return info
3445
3446 class WorldStarHipHopIE(InfoExtractor):
3447     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3448     IE_NAME = u'WorldStarHipHop'
3449
3450     def _real_extract(self, url):
3451         m = re.match(self._VALID_URL, url)
3452         video_id = m.group('id')
3453
3454         webpage_src = self._download_webpage(url, video_id)
3455
3456         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3457             webpage_src, u'video URL')
3458
3459         if 'mp4' in video_url:
3460             ext = 'mp4'
3461         else:
3462             ext = 'flv'
3463
3464         video_title = self._html_search_regex(r"<title>(.*)</title>",
3465             webpage_src, u'title')
3466
3467         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3468         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3469             webpage_src, u'thumbnail', fatal=False)
3470
3471         if not thumbnail:
3472             _title = r"""candytitles.*>(.*)</span>"""
3473             mobj = re.search(_title, webpage_src)
3474             if mobj is not None:
3475                 video_title = mobj.group(1)
3476
3477         results = [{
3478                     'id': video_id,
3479                     'url' : video_url,
3480                     'title' : video_title,
3481                     'thumbnail' : thumbnail,
3482                     'ext' : ext,
3483                     }]
3484         return results
3485
3486 class RBMARadioIE(InfoExtractor):
3487     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3488
3489     def _real_extract(self, url):
3490         m = re.match(self._VALID_URL, url)
3491         video_id = m.group('videoID')
3492
3493         webpage = self._download_webpage(url, video_id)
3494
3495         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
3496             webpage, u'json data', flags=re.MULTILINE)
3497
3498         try:
3499             data = json.loads(json_data)
3500         except ValueError as e:
3501             raise ExtractorError(u'Invalid JSON: ' + str(e))
3502
3503         video_url = data['akamai_url'] + '&cbr=256'
3504         url_parts = compat_urllib_parse_urlparse(video_url)
3505         video_ext = url_parts.path.rpartition('.')[2]
3506         info = {
3507                 'id': video_id,
3508                 'url': video_url,
3509                 'ext': video_ext,
3510                 'title': data['title'],
3511                 'description': data.get('teaser_text'),
3512                 'location': data.get('country_of_origin'),
3513                 'uploader': data.get('host', {}).get('name'),
3514                 'uploader_id': data.get('host', {}).get('slug'),
3515                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3516                 'duration': data.get('duration'),
3517         }
3518         return [info]
3519
3520
3521 class YouPornIE(InfoExtractor):
3522     """Information extractor for youporn.com."""
3523     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3524
3525     def _print_formats(self, formats):
3526         """Print all available formats"""
3527         print(u'Available formats:')
3528         print(u'ext\t\tformat')
3529         print(u'---------------------------------')
3530         for format in formats:
3531             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3532
3533     def _specific(self, req_format, formats):
3534         for x in formats:
3535             if(x["format"]==req_format):
3536                 return x
3537         return None
3538
3539     def _real_extract(self, url):
3540         mobj = re.match(self._VALID_URL, url)
3541         if mobj is None:
3542             raise ExtractorError(u'Invalid URL: %s' % url)
3543         video_id = mobj.group('videoid')
3544
3545         req = compat_urllib_request.Request(url)
3546         req.add_header('Cookie', 'age_verified=1')
3547         webpage = self._download_webpage(req, video_id)
3548
3549         # Get JSON parameters
3550         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3551         try:
3552             params = json.loads(json_params)
3553         except:
3554             raise ExtractorError(u'Invalid JSON')
3555
3556         self.report_extraction(video_id)
3557         try:
3558             video_title = params['title']
3559             upload_date = unified_strdate(params['release_date_f'])
3560             video_description = params['description']
3561             video_uploader = params['submitted_by']
3562             thumbnail = params['thumbnails'][0]['image']
3563         except KeyError:
3564             raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3565
3566         # Get all of the formats available
3567         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3568         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3569             webpage, u'download list').strip()
3570
3571         # Get all of the links from the page
3572         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3573         links = re.findall(LINK_RE, download_list_html)
3574         if(len(links) == 0):
3575             raise ExtractorError(u'ERROR: no known formats available for video')
3576
3577         self.to_screen(u'Links found: %d' % len(links))
3578
3579         formats = []
3580         for link in links:
3581
3582             # A link looks like this:
3583             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3584             # A path looks like this:
3585             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3586             video_url = unescapeHTML( link )
3587             path = compat_urllib_parse_urlparse( video_url ).path
3588             extension = os.path.splitext( path )[1][1:]
3589             format = path.split('/')[4].split('_')[:2]
3590             size = format[0]
3591             bitrate = format[1]
3592             format = "-".join( format )
3593             # title = u'%s-%s-%s' % (video_title, size, bitrate)
3594
3595             formats.append({
3596                 'id': video_id,
3597                 'url': video_url,
3598                 'uploader': video_uploader,
3599                 'upload_date': upload_date,
3600                 'title': video_title,
3601                 'ext': extension,
3602                 'format': format,
3603                 'thumbnail': thumbnail,
3604                 'description': video_description
3605             })
3606
3607         if self._downloader.params.get('listformats', None):
3608             self._print_formats(formats)
3609             return
3610
3611         req_format = self._downloader.params.get('format', None)
3612         self.to_screen(u'Format: %s' % req_format)
3613
3614         if req_format is None or req_format == 'best':
3615             return [formats[0]]
3616         elif req_format == 'worst':
3617             return [formats[-1]]
3618         elif req_format in ('-1', 'all'):
3619             return formats
3620         else:
3621             format = self._specific( req_format, formats )
3622             if result is None:
3623                 raise ExtractorError(u'Requested format not available')
3624             return [format]
3625
3626
3627
3628 class PornotubeIE(InfoExtractor):
3629     """Information extractor for pornotube.com."""
3630     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3631
3632     def _real_extract(self, url):
3633         mobj = re.match(self._VALID_URL, url)
3634         if mobj is None:
3635             raise ExtractorError(u'Invalid URL: %s' % url)
3636
3637         video_id = mobj.group('videoid')
3638         video_title = mobj.group('title')
3639
3640         # Get webpage content
3641         webpage = self._download_webpage(url, video_id)
3642
3643         # Get the video URL
3644         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3645         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3646         video_url = compat_urllib_parse.unquote(video_url)
3647
3648         #Get the uploaded date
3649         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3650         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
3651         if upload_date: upload_date = unified_strdate(upload_date)
3652
3653         info = {'id': video_id,
3654                 'url': video_url,
3655                 'uploader': None,
3656                 'upload_date': upload_date,
3657                 'title': video_title,
3658                 'ext': 'flv',
3659                 'format': 'flv'}
3660
3661         return [info]
3662
3663 class YouJizzIE(InfoExtractor):
3664     """Information extractor for youjizz.com."""
3665     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3666
3667     def _real_extract(self, url):
3668         mobj = re.match(self._VALID_URL, url)
3669         if mobj is None:
3670             raise ExtractorError(u'Invalid URL: %s' % url)
3671
3672         video_id = mobj.group('videoid')
3673
3674         # Get webpage content
3675         webpage = self._download_webpage(url, video_id)
3676
3677         # Get the video title
3678         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3679             webpage, u'title').strip()
3680
3681         # Get the embed page
3682         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3683         if result is None:
3684             raise ExtractorError(u'ERROR: unable to extract embed page')
3685
3686         embed_page_url = result.group(0).strip()
3687         video_id = result.group('videoid')
3688
3689         webpage = self._download_webpage(embed_page_url, video_id)
3690
3691         # Get the video URL
3692         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3693             webpage, u'video URL')
3694
3695         info = {'id': video_id,
3696                 'url': video_url,
3697                 'title': video_title,
3698                 'ext': 'flv',
3699                 'format': 'flv',
3700                 'player_url': embed_page_url}
3701
3702         return [info]
3703
3704 class EightTracksIE(InfoExtractor):
3705     IE_NAME = '8tracks'
3706     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3707
3708     def _real_extract(self, url):
3709         mobj = re.match(self._VALID_URL, url)
3710         if mobj is None:
3711             raise ExtractorError(u'Invalid URL: %s' % url)
3712         playlist_id = mobj.group('id')
3713
3714         webpage = self._download_webpage(url, playlist_id)
3715
3716         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3717         data = json.loads(json_like)
3718
3719         session = str(random.randint(0, 1000000000))
3720         mix_id = data['id']
3721         track_count = data['tracks_count']
3722         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3723         next_url = first_url
3724         res = []
3725         for i in itertools.count():
3726             api_json = self._download_webpage(next_url, playlist_id,
3727                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3728                 errnote=u'Failed to download song information')
3729             api_data = json.loads(api_json)
3730             track_data = api_data[u'set']['track']
3731             info = {
3732                 'id': track_data['id'],
3733                 'url': track_data['track_file_stream_url'],
3734                 'title': track_data['performer'] + u' - ' + track_data['name'],
3735                 'raw_title': track_data['name'],
3736                 'uploader_id': data['user']['login'],
3737                 'ext': 'm4a',
3738             }
3739             res.append(info)
3740             if api_data['set']['at_last_track']:
3741                 break
3742             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3743         return res
3744
3745 class KeekIE(InfoExtractor):
3746     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3747     IE_NAME = u'keek'
3748
3749     def _real_extract(self, url):
3750         m = re.match(self._VALID_URL, url)
3751         video_id = m.group('videoID')
3752
3753         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3754         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3755         webpage = self._download_webpage(url, video_id)
3756
3757         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3758             webpage, u'title')
3759
3760         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3761             webpage, u'uploader', fatal=False)
3762
3763         info = {
3764                 'id': video_id,
3765                 'url': video_url,
3766                 'ext': 'mp4',
3767                 'title': video_title,
3768                 'thumbnail': thumbnail,
3769                 'uploader': uploader
3770         }
3771         return [info]
3772
3773 class TEDIE(InfoExtractor):
3774     _VALID_URL=r'''http://www\.ted\.com/
3775                    (
3776                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3777                         |
3778                         ((?P<type_talk>talks)) # We have a simple talk
3779                    )
3780                    (/lang/(.*?))? # The url may contain the language
3781                    /(?P<name>\w+) # Here goes the name and then ".html"
3782                    '''
3783
3784     @classmethod
3785     def suitable(cls, url):
3786         """Receives a URL and returns True if suitable for this IE."""
3787         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3788
3789     def _real_extract(self, url):
3790         m=re.match(self._VALID_URL, url, re.VERBOSE)
3791         if m.group('type_talk'):
3792             return [self._talk_info(url)]
3793         else :
3794             playlist_id=m.group('playlist_id')
3795             name=m.group('name')
3796             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3797             return [self._playlist_videos_info(url,name,playlist_id)]
3798
3799     def _playlist_videos_info(self,url,name,playlist_id=0):
3800         '''Returns the videos of the playlist'''
3801         video_RE=r'''
3802                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3803                      ([.\s]*?)data-playlist_item_id="(\d+)"
3804                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3805                      '''
3806         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3807         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3808         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3809         m_names=re.finditer(video_name_RE,webpage)
3810
3811         playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
3812                                                  webpage, 'playlist title')
3813
3814         playlist_entries = []
3815         for m_video, m_name in zip(m_videos,m_names):
3816             video_id=m_video.group('video_id')
3817             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3818             playlist_entries.append(self.url_result(talk_url, 'TED'))
3819         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3820
3821     def _talk_info(self, url, video_id=0):
3822         """Return the video for the talk in the url"""
3823         m = re.match(self._VALID_URL, url,re.VERBOSE)
3824         video_name = m.group('name')
3825         webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
3826         self.report_extraction(video_name)
3827         # If the url includes the language we get the title translated
3828         title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
3829                                         webpage, 'title')
3830         json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
3831                                     webpage, 'json data')
3832         info = json.loads(json_data)
3833         desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
3834                                        webpage, 'description', flags = re.DOTALL)
3835
3836         thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
3837                                        webpage, 'thumbnail')
3838         info = {
3839                 'id': info['id'],
3840                 'url': info['htmlStreams'][-1]['file'],
3841                 'ext': 'mp4',
3842                 'title': title,
3843                 'thumbnail': thumbnail,
3844                 'description': desc,
3845                 }
3846         return info
3847
3848 class MySpassIE(InfoExtractor):
3849     _VALID_URL = r'http://www.myspass.de/.*'
3850
3851     def _real_extract(self, url):
3852         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3853
3854         # video id is the last path element of the URL
3855         # usually there is a trailing slash, so also try the second but last
3856         url_path = compat_urllib_parse_urlparse(url).path
3857         url_parent_path, video_id = os.path.split(url_path)
3858         if not video_id:
3859             _, video_id = os.path.split(url_parent_path)
3860
3861         # get metadata
3862         metadata_url = META_DATA_URL_TEMPLATE % video_id
3863         metadata_text = self._download_webpage(metadata_url, video_id)
3864         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3865
3866         # extract values from metadata
3867         url_flv_el = metadata.find('url_flv')
3868         if url_flv_el is None:
3869             raise ExtractorError(u'Unable to extract download url')
3870         video_url = url_flv_el.text
3871         extension = os.path.splitext(video_url)[1][1:]
3872         title_el = metadata.find('title')
3873         if title_el is None:
3874             raise ExtractorError(u'Unable to extract title')
3875         title = title_el.text
3876         format_id_el = metadata.find('format_id')
3877         if format_id_el is None:
3878             format = ext
3879         else:
3880             format = format_id_el.text
3881         description_el = metadata.find('description')
3882         if description_el is not None:
3883             description = description_el.text
3884         else:
3885             description = None
3886         imagePreview_el = metadata.find('imagePreview')
3887         if imagePreview_el is not None:
3888             thumbnail = imagePreview_el.text
3889         else:
3890             thumbnail = None
3891         info = {
3892             'id': video_id,
3893             'url': video_url,
3894             'title': title,
3895             'ext': extension,
3896             'format': format,
3897             'thumbnail': thumbnail,
3898             'description': description
3899         }
3900         return [info]
3901
3902 class SpiegelIE(InfoExtractor):
3903     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3904
3905     def _real_extract(self, url):
3906         m = re.match(self._VALID_URL, url)
3907         video_id = m.group('videoID')
3908
3909         webpage = self._download_webpage(url, video_id)
3910
3911         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3912             webpage, u'title')
3913
3914         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3915         xml_code = self._download_webpage(xml_url, video_id,
3916                     note=u'Downloading XML', errnote=u'Failed to download XML')
3917
3918         idoc = xml.etree.ElementTree.fromstring(xml_code)
3919         last_type = idoc[-1]
3920         filename = last_type.findall('./filename')[0].text
3921         duration = float(last_type.findall('./duration')[0].text)
3922
3923         video_url = 'http://video2.spiegel.de/flash/' + filename
3924         video_ext = filename.rpartition('.')[2]
3925         info = {
3926             'id': video_id,
3927             'url': video_url,
3928             'ext': video_ext,
3929             'title': video_title,
3930             'duration': duration,
3931         }
3932         return [info]
3933
3934 class LiveLeakIE(InfoExtractor):
3935
3936     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3937     IE_NAME = u'liveleak'
3938
3939     def _real_extract(self, url):
3940         mobj = re.match(self._VALID_URL, url)
3941         if mobj is None:
3942             raise ExtractorError(u'Invalid URL: %s' % url)
3943
3944         video_id = mobj.group('video_id')
3945
3946         webpage = self._download_webpage(url, video_id)
3947
3948         video_url = self._search_regex(r'file: "(.*?)",',
3949             webpage, u'video URL')
3950
3951         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3952             webpage, u'title').replace('LiveLeak.com -', '').strip()
3953
3954         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3955             webpage, u'description', fatal=False)
3956
3957         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3958             webpage, u'uploader', fatal=False)
3959
3960         info = {
3961             'id':  video_id,
3962             'url': video_url,
3963             'ext': 'mp4',
3964             'title': video_title,
3965             'description': video_description,
3966             'uploader': video_uploader
3967         }
3968
3969         return [info]
3970
3971 class ARDIE(InfoExtractor):
3972     _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3973     _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3974     _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3975
3976     def _real_extract(self, url):
3977         # determine video id from url
3978         m = re.match(self._VALID_URL, url)
3979
3980         numid = re.search(r'documentId=([0-9]+)', url)
3981         if numid:
3982             video_id = numid.group(1)
3983         else:
3984             video_id = m.group('video_id')
3985
3986         # determine title and media streams from webpage
3987         html = self._download_webpage(url, video_id)
3988         title = re.search(self._TITLE, html).group('title')
3989         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
3990         if not streams:
3991             assert '"fsk"' in html
3992             raise ExtractorError(u'This video is only available after 8:00 pm')
3993
3994         # choose default media type and highest quality for now
3995         stream = max([s for s in streams if int(s["media_type"]) == 0],
3996                      key=lambda s: int(s["quality"]))
3997
3998         # there's two possibilities: RTMP stream or HTTP download
3999         info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4000         if stream['rtmp_url']:
4001             self.to_screen(u'RTMP download detected')
4002             assert stream['video_url'].startswith('mp4:')
4003             info["url"] = stream["rtmp_url"]
4004             info["play_path"] = stream['video_url']
4005         else:
4006             assert stream["video_url"].endswith('.mp4')
4007             info["url"] = stream["video_url"]
4008         return [info]
4009
4010 class ZDFIE(InfoExtractor):
4011     _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4012     _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
4013     _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
4014     _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
4015     _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
4016
4017     def _real_extract(self, url):
4018         mobj = re.match(self._VALID_URL, url)
4019         if mobj is None:
4020             raise ExtractorError(u'Invalid URL: %s' % url)
4021         video_id = mobj.group('video_id')
4022
4023         html = self._download_webpage(url, video_id)
4024         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4025         if streams is None:
4026             raise ExtractorError(u'No media url found.')
4027
4028         # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
4029         # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
4030         # choose first/default media type and highest quality for now
4031         for s in streams:        #find 300 - dsl1000mbit
4032             if s['quality'] == '300' and s['media_type'] == 'wstreaming':
4033                 stream_=s
4034                 break
4035         for s in streams:        #find veryhigh - dsl2000mbit
4036             if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
4037                 stream_=s
4038                 break
4039         if stream_ is None:
4040             raise ExtractorError(u'No stream found.')
4041
4042         media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
4043
4044         self.report_extraction(video_id)
4045         mobj = re.search(self._TITLE, html)
4046         if mobj is None:
4047             raise ExtractorError(u'Cannot extract title')
4048         title = unescapeHTML(mobj.group('title'))
4049
4050         mobj = re.search(self._MMS_STREAM, media_link)
4051         if mobj is None:
4052             mobj = re.search(self._RTSP_STREAM, media_link)
4053             if mobj is None:
4054                 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
4055         mms_url = mobj.group('video_url')
4056
4057         mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
4058         if mobj is None:
4059             raise ExtractorError(u'Cannot extract extention')
4060         ext = mobj.group('ext')
4061
4062         return [{'id': video_id,
4063                  'url': mms_url,
4064                  'title': title,
4065                  'ext': ext
4066                  }]
4067
4068 class TumblrIE(InfoExtractor):
4069     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4070
4071     def _real_extract(self, url):
4072         m_url = re.match(self._VALID_URL, url)
4073         video_id = m_url.group('id')
4074         blog = m_url.group('blog_name')
4075
4076         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4077         webpage = self._download_webpage(url, video_id)
4078
4079         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4080         video = re.search(re_video, webpage)
4081         if video is None:
4082            raise ExtractorError(u'Unable to extract video')
4083         video_url = video.group('video_url')
4084         ext = video.group('ext')
4085
4086         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
4087             webpage, u'thumbnail', fatal=False)  # We pick the first poster
4088         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
4089
4090         # The only place where you can get a title, it's not complete,
4091         # but searching in other places doesn't work for all videos
4092         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
4093             webpage, u'title', flags=re.DOTALL)
4094
4095         return [{'id': video_id,
4096                  'url': video_url,
4097                  'title': video_title,
4098                  'thumbnail': video_thumbnail,
4099                  'ext': ext
4100                  }]
4101
4102 class BandcampIE(InfoExtractor):
4103     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4104
4105     def _real_extract(self, url):
4106         mobj = re.match(self._VALID_URL, url)
4107         title = mobj.group('title')
4108         webpage = self._download_webpage(url, title)
4109         # We get the link to the free download page
4110         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4111         if m_download is None:
4112             raise ExtractorError(u'No free songs found')
4113
4114         download_link = m_download.group(1)
4115         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4116                        webpage, re.MULTILINE|re.DOTALL).group('id')
4117
4118         download_webpage = self._download_webpage(download_link, id,
4119                                                   'Downloading free downloads page')
4120         # We get the dictionary of the track from some javascrip code
4121         info = re.search(r'items: (.*?),$',
4122                          download_webpage, re.MULTILINE).group(1)
4123         info = json.loads(info)[0]
4124         # We pick mp3-320 for now, until format selection can be easily implemented.
4125         mp3_info = info[u'downloads'][u'mp3-320']
4126         # If we try to use this url it says the link has expired
4127         initial_url = mp3_info[u'url']
4128         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4129         m_url = re.match(re_url, initial_url)
4130         #We build the url we will use to get the final track url
4131         # This url is build in Bandcamp in the script download_bunde_*.js
4132         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4133         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4134         # If we could correctly generate the .rand field the url would be
4135         #in the "download_url" key
4136         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4137
4138         track_info = {'id':id,
4139                       'title' : info[u'title'],
4140                       'ext' :   'mp3',
4141                       'url' :   final_url,
4142                       'thumbnail' : info[u'thumb_url'],
4143                       'uploader' :  info[u'artist']
4144                       }
4145
4146         return [track_info]
4147
4148 class RedTubeIE(InfoExtractor):
4149     """Information Extractor for redtube"""
4150     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4151
4152     def _real_extract(self,url):
4153         mobj = re.match(self._VALID_URL, url)
4154         if mobj is None:
4155             raise ExtractorError(u'Invalid URL: %s' % url)
4156
4157         video_id = mobj.group('id')
4158         video_extension = 'mp4'
4159         webpage = self._download_webpage(url, video_id)
4160
4161         self.report_extraction(video_id)
4162
4163         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
4164             webpage, u'video URL')
4165
4166         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
4167             webpage, u'title')
4168
4169         return [{
4170             'id':       video_id,
4171             'url':      video_url,
4172             'ext':      video_extension,
4173             'title':    video_title,
4174         }]
4175
4176 class InaIE(InfoExtractor):
4177     """Information Extractor for Ina.fr"""
4178     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4179
4180     def _real_extract(self,url):
4181         mobj = re.match(self._VALID_URL, url)
4182
4183         video_id = mobj.group('id')
4184         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4185         video_extension = 'mp4'
4186         webpage = self._download_webpage(mrss_url, video_id)
4187
4188         self.report_extraction(video_id)
4189
4190         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
4191             webpage, u'video URL')
4192
4193         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
4194             webpage, u'title')
4195
4196         return [{
4197             'id':       video_id,
4198             'url':      video_url,
4199             'ext':      video_extension,
4200             'title':    video_title,
4201         }]
4202
4203 class HowcastIE(InfoExtractor):
4204     """Information Extractor for Howcast.com"""
4205     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4206
4207     def _real_extract(self, url):
4208         mobj = re.match(self._VALID_URL, url)
4209
4210         video_id = mobj.group('id')
4211         webpage_url = 'http://www.howcast.com/videos/' + video_id
4212         webpage = self._download_webpage(webpage_url, video_id)
4213
4214         self.report_extraction(video_id)
4215
4216         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
4217             webpage, u'video URL')
4218
4219         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
4220             webpage, u'title')
4221
4222         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
4223             webpage, u'description', fatal=False)
4224
4225         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
4226             webpage, u'thumbnail', fatal=False)
4227
4228         return [{
4229             'id':       video_id,
4230             'url':      video_url,
4231             'ext':      'mp4',
4232             'title':    video_title,
4233             'description': video_description,
4234             'thumbnail': thumbnail,
4235         }]
4236
4237 class VineIE(InfoExtractor):
4238     """Information Extractor for Vine.co"""
4239     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4240
4241     def _real_extract(self, url):
4242         mobj = re.match(self._VALID_URL, url)
4243
4244         video_id = mobj.group('id')
4245         webpage_url = 'https://vine.co/v/' + video_id
4246         webpage = self._download_webpage(webpage_url, video_id)
4247
4248         self.report_extraction(video_id)
4249
4250         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
4251             webpage, u'video URL')
4252
4253         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
4254             webpage, u'title')
4255
4256         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
4257             webpage, u'thumbnail', fatal=False)
4258
4259         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
4260             webpage, u'uploader', fatal=False, flags=re.DOTALL)
4261
4262         return [{
4263             'id':        video_id,
4264             'url':       video_url,
4265             'ext':       'mp4',
4266             'title':     video_title,
4267             'thumbnail': thumbnail,
4268             'uploader':  uploader,
4269         }]
4270
4271 class FlickrIE(InfoExtractor):
4272     """Information Extractor for Flickr videos"""
4273     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4274
4275     def _real_extract(self, url):
4276         mobj = re.match(self._VALID_URL, url)
4277
4278         video_id = mobj.group('id')
4279         video_uploader_id = mobj.group('uploader_id')
4280         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4281         webpage = self._download_webpage(webpage_url, video_id)
4282
4283         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
4284
4285         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4286         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
4287
4288         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
4289             first_xml, u'node_id')
4290
4291         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4292         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4293
4294         self.report_extraction(video_id)
4295
4296         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
4297         if mobj is None:
4298             raise ExtractorError(u'Unable to extract video url')
4299         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
4300
4301         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
4302             webpage, u'video title')
4303
4304         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
4305             webpage, u'description', fatal=False)
4306
4307         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
4308             webpage, u'thumbnail', fatal=False)
4309
4310         return [{
4311             'id':          video_id,
4312             'url':         video_url,
4313             'ext':         'mp4',
4314             'title':       video_title,
4315             'description': video_description,
4316             'thumbnail':   thumbnail,
4317             'uploader_id': video_uploader_id,
4318         }]
4319
4320 class TeamcocoIE(InfoExtractor):
4321     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4322
4323     def _real_extract(self, url):
4324         mobj = re.match(self._VALID_URL, url)
4325         if mobj is None:
4326             raise ExtractorError(u'Invalid URL: %s' % url)
4327         url_title = mobj.group('url_title')
4328         webpage = self._download_webpage(url, url_title)
4329
4330         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
4331             webpage, u'video id')
4332
4333         self.report_extraction(video_id)
4334
4335         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
4336             webpage, u'title')
4337
4338         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
4339             webpage, u'thumbnail', fatal=False)
4340
4341         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
4342             webpage, u'description', fatal=False)
4343
4344         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4345         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
4346
4347         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
4348             data, u'video URL')
4349
4350         return [{
4351             'id':          video_id,
4352             'url':         video_url,
4353             'ext':         'mp4',
4354             'title':       video_title,
4355             'thumbnail':   thumbnail,
4356             'description': video_description,
4357         }]
4358
4359 class XHamsterIE(InfoExtractor):
4360     """Information Extractor for xHamster"""
4361     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4362
4363     def _real_extract(self,url):
4364         mobj = re.match(self._VALID_URL, url)
4365
4366         video_id = mobj.group('id')
4367         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
4368         webpage = self._download_webpage(mrss_url, video_id)
4369
4370         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4371         if mobj is None:
4372             raise ExtractorError(u'Unable to extract media URL')
4373         if len(mobj.group('server')) == 0:
4374             video_url = compat_urllib_parse.unquote(mobj.group('file'))
4375         else:
4376             video_url = mobj.group('server')+'/key='+mobj.group('file')
4377         video_extension = video_url.split('.')[-1]
4378
4379         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
4380             webpage, u'title')
4381
4382         # Can't see the description anywhere in the UI
4383         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
4384         #     webpage, u'description', fatal=False)
4385         # if video_description: video_description = unescapeHTML(video_description)
4386
4387         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4388         if mobj:
4389             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4390         else:
4391             video_upload_date = None
4392             self._downloader.report_warning(u'Unable to extract upload date')
4393
4394         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
4395             webpage, u'uploader id', default=u'anonymous')
4396
4397         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
4398             webpage, u'thumbnail', fatal=False)
4399
4400         return [{
4401             'id':       video_id,
4402             'url':      video_url,
4403             'ext':      video_extension,
4404             'title':    video_title,
4405             # 'description': video_description,
4406             'upload_date': video_upload_date,
4407             'uploader_id': video_uploader_id,
4408             'thumbnail': video_thumbnail
4409         }]
4410
4411 class HypemIE(InfoExtractor):
4412     """Information Extractor for hypem"""
4413     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4414
4415     def _real_extract(self, url):
4416         mobj = re.match(self._VALID_URL, url)
4417         if mobj is None:
4418             raise ExtractorError(u'Invalid URL: %s' % url)
4419         track_id = mobj.group(1)
4420
4421         data = { 'ax': 1, 'ts': time.time() }
4422         data_encoded = compat_urllib_parse.urlencode(data)
4423         complete_url = url + "?" + data_encoded
4424         request = compat_urllib_request.Request(complete_url)
4425         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
4426         cookie = urlh.headers.get('Set-Cookie', '')
4427
4428         self.report_extraction(track_id)
4429
4430         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
4431             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
4432         try:
4433             track_list = json.loads(html_tracks)
4434             track = track_list[u'tracks'][0]
4435         except ValueError:
4436             raise ExtractorError(u'Hypemachine contained invalid JSON.')
4437
4438         key = track[u"key"]
4439         track_id = track[u"id"]
4440         artist = track[u"artist"]
4441         title = track[u"song"]
4442
4443         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4444         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4445         request.add_header('cookie', cookie)
4446         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4447         try:
4448             song_data = json.loads(song_data_json)
4449         except ValueError:
4450             raise ExtractorError(u'Hypemachine contained invalid JSON.')
4451         final_url = song_data[u"url"]
4452
4453         return [{
4454             'id':       track_id,
4455             'url':      final_url,
4456             'ext':      "mp3",
4457             'title':    title,
4458             'artist':   artist,
4459         }]
4460
4461 class Vbox7IE(InfoExtractor):
4462     """Information Extractor for Vbox7"""
4463     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
4464
4465     def _real_extract(self,url):
4466         mobj = re.match(self._VALID_URL, url)
4467         if mobj is None:
4468             raise ExtractorError(u'Invalid URL: %s' % url)
4469         video_id = mobj.group(1)
4470
4471         redirect_page, urlh = self._download_webpage_handle(url, video_id)
4472         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
4473         redirect_url = urlh.geturl() + new_location
4474         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
4475
4476         title = self._html_search_regex(r'<title>(.*)</title>',
4477             webpage, u'title').split('/')[0].strip()
4478
4479         ext = "flv"
4480         info_url = "http://vbox7.com/play/magare.do"
4481         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
4482         info_request = compat_urllib_request.Request(info_url, data)
4483         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
4484         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
4485         if info_response is None:
4486             raise ExtractorError(u'Unable to extract the media url')
4487         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
4488
4489         return [{
4490             'id':        video_id,
4491             'url':       final_url,
4492             'ext':       ext,
4493             'title':     title,
4494             'thumbnail': thumbnail_url,
4495         }]
4496
4497 class GametrailersIE(InfoExtractor):
4498     _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
4499
4500     def _real_extract(self, url):
4501         mobj = re.match(self._VALID_URL, url)
4502         if mobj is None:
4503             raise ExtractorError(u'Invalid URL: %s' % url)
4504         video_id = mobj.group('id')
4505         video_type = mobj.group('type')
4506         webpage = self._download_webpage(url, video_id)
4507         if video_type == 'full-episodes':
4508             mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
4509         else:
4510             mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
4511         mgid = self._search_regex(mgid_re, webpage, u'mgid')
4512         data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
4513
4514         info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
4515                                            video_id, u'Downloading video info')
4516         links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
4517                                                video_id, u'Downloading video urls info')
4518
4519         self.report_extraction(video_id)
4520         info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
4521                       <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
4522                       <image>.*
4523                         <url>(?P<thumb>.*?)</url>.*
4524                       </image>'''
4525
4526         m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
4527         if m_info is None:
4528             raise ExtractorError(u'Unable to extract video info')
4529         video_title = m_info.group('title')
4530         video_description = m_info.group('description')
4531         video_thumb = m_info.group('thumb')
4532
4533         m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
4534         if m_urls is None or len(m_urls) == 0:
4535             raise ExtractError(u'Unable to extrat video url')
4536         # They are sorted from worst to best quality
4537         video_url = m_urls[-1].group('url')
4538
4539         return {'url':         video_url,
4540                 'id':          video_id,
4541                 'title':       video_title,
4542                 # Videos are actually flv not mp4
4543                 'ext':         'flv',
4544                 'thumbnail':   video_thumb,
4545                 'description': video_description,
4546                 }
4547
4548 def gen_extractors():
4549     """ Return a list of an instance of every supported extractor.
4550     The order does matter; the first extractor matched is the one handling the URL.
4551     """
4552     return [
4553         YoutubePlaylistIE(),
4554         YoutubeChannelIE(),
4555         YoutubeUserIE(),
4556         YoutubeSearchIE(),
4557         YoutubeIE(),
4558         MetacafeIE(),
4559         DailymotionIE(),
4560         GoogleSearchIE(),
4561         PhotobucketIE(),
4562         YahooIE(),
4563         YahooSearchIE(),
4564         DepositFilesIE(),
4565         FacebookIE(),
4566         BlipTVIE(),
4567         BlipTVUserIE(),
4568         VimeoIE(),
4569         MyVideoIE(),
4570         ComedyCentralIE(),
4571         EscapistIE(),
4572         CollegeHumorIE(),
4573         XVideosIE(),
4574         SoundcloudSetIE(),
4575         SoundcloudIE(),
4576         InfoQIE(),
4577         MixcloudIE(),
4578         StanfordOpenClassroomIE(),
4579         MTVIE(),
4580         YoukuIE(),
4581         XNXXIE(),
4582         YouJizzIE(),
4583         PornotubeIE(),
4584         YouPornIE(),
4585         GooglePlusIE(),
4586         ArteTvIE(),
4587         NBAIE(),
4588         WorldStarHipHopIE(),
4589         JustinTVIE(),
4590         FunnyOrDieIE(),
4591         SteamIE(),
4592         UstreamIE(),
4593         RBMARadioIE(),
4594         EightTracksIE(),
4595         KeekIE(),
4596         TEDIE(),
4597         MySpassIE(),
4598         SpiegelIE(),
4599         LiveLeakIE(),
4600         ARDIE(),
4601         ZDFIE(),
4602         TumblrIE(),
4603         BandcampIE(),
4604         RedTubeIE(),
4605         InaIE(),
4606         HowcastIE(),
4607         VineIE(),
4608         FlickrIE(),
4609         TeamcocoIE(),
4610         XHamsterIE(),
4611         HypemIE(),
4612         Vbox7IE(),
4613         GametrailersIE(),
4614         GenericIE()
4615     ]
4616
4617 def get_info_extractor(ie_name):
4618     """Returns the info extractor class with the given ie_name"""
4619     return globals()[ie_name+'IE']