youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19 import hashlib
  20 import binascii
  21 import urllib
  22
  23 from .utils import *
  24
  25
  26 class InfoExtractor(object):
  27     """Information Extractor class.
  28
  29     Information extractors are the classes that, given a URL, extract
  30     information about the video (or videos) the URL refers to. This
  31     information includes the real video URL, the video title, author and
  32     others. The information is stored in a dictionary which is then
  33     passed to the FileDownloader. The FileDownloader processes this
  34     information possibly downloading the video to the file system, among
  35     other possible outcomes.
  36
  37     The dictionaries must include the following fields:
  38
  39     id:             Video identifier.
  40     url:            Final video URL.
  41     title:          Video title, unescaped.
  42     ext:            Video filename extension.
  43
  44     The following fields are optional:
  45
  46     format:         The video format, defaults to ext (used for --get-format)
  47     thumbnail:      Full URL to a video thumbnail image.
  48     description:    One-line video description.
  49     uploader:       Full name of the video uploader.
  50     upload_date:    Video upload date (YYYYMMDD).
  51     uploader_id:    Nickname or id of the video uploader.
  52     location:       Physical location of the video.
  53     player_url:     SWF Player URL (used for rtmpdump).
  54     subtitles:      The subtitle file contents.
  55     urlhandle:      [internal] The urlHandle to be used to download the file,
  56                     like returned by urllib.request.urlopen
  57
  58     The fields should all be Unicode strings.
  59
  60     Subclasses of this one should re-define the _real_initialize() and
  61     _real_extract() methods and define a _VALID_URL regexp.
  62     Probably, they should also be added to the list of extractors.
  63
  64     _real_extract() must return a *list* of information dictionaries as
  65     described above.
  66
  67     Finally, the _WORKING attribute should be set to False for broken IEs
  68     in order to warn the users and skip the tests.
  69     """
  70
  71     _ready = False
  72     _downloader = None
  73     _WORKING = True
  74
  75     def __init__(self, downloader=None):
  76         """Constructor. Receives an optional downloader."""
  77         self._ready = False
  78         self.set_downloader(downloader)
  79
  80     @classmethod
  81     def suitable(cls, url):
  82         """Receives a URL and returns True if suitable for this IE."""
  83         return re.match(cls._VALID_URL, url) is not None
  84
  85     @classmethod
  86     def working(cls):
  87         """Getter method for _WORKING."""
  88         return cls._WORKING
  89
  90     def initialize(self):
  91         """Initializes an instance (authentication, etc)."""
  92         if not self._ready:
  93             self._real_initialize()
  94             self._ready = True
  95
  96     def extract(self, url):
  97         """Extracts URL information and returns it in list of dicts."""
  98         self.initialize()
  99         return self._real_extract(url)
 100
 101     def set_downloader(self, downloader):
 102         """Sets the downloader for this IE."""
 103         self._downloader = downloader
 104
 105     def _real_initialize(self):
 106         """Real initialization process. Redefine in subclasses."""
 107         pass
 108
 109     def _real_extract(self, url):
 110         """Real extraction process. Redefine in subclasses."""
 111         pass
 112
 113     @property
 114     def IE_NAME(self):
 115         return type(self).__name__[:-2]
 116
 117     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 118         """ Returns the response handle """
 119         if note is None:
 120             self.report_download_webpage(video_id)
 121         elif note is not False:
 122             self.to_screen(u'%s: %s' % (video_id, note))
 123         try:
 124             return compat_urllib_request.urlopen(url_or_request)
 125         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 126             if errnote is None:
 127                 errnote = u'Unable to download webpage'
 128             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 129
 130     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 131         """ Returns a tuple (page content as string, URL handle) """
 132         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 133         content_type = urlh.headers.get('Content-Type', '')
 134         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 135         if m:
 136             encoding = m.group(1)
 137         else:
 138             encoding = 'utf-8'
 139         webpage_bytes = urlh.read()
 140         if self._downloader.params.get('dump_intermediate_pages', False):
 141             try:
 142                 url = url_or_request.get_full_url()
 143             except AttributeError:
 144                 url = url_or_request
 145             self.to_screen(u'Dumping request to ' + url)
 146             dump = base64.b64encode(webpage_bytes).decode('ascii')
 147             self._downloader.to_screen(dump)
 148         content = webpage_bytes.decode(encoding, 'replace')
 149         return (content, urlh)
 150
 151     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 152         """ Returns the data of the page as a string """
 153         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 154
 155     def to_screen(self, msg):
 156         """Print msg to screen, prefixing it with '[ie_name]'"""
 157         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 158
 159     def report_extraction(self, id_or_name):
 160         """Report information extraction."""
 161         self.to_screen(u'%s: Extracting information' % id_or_name)
 162
 163     def report_download_webpage(self, video_id):
 164         """Report webpage download."""
 165         self.to_screen(u'%s: Downloading webpage' % video_id)
 166
 167     def report_age_confirmation(self):
 168         """Report attempt to confirm age."""
 169         self.to_screen(u'Confirming age')
 170
 171     #Methods for following #608
 172     #They set the correct value of the '_type' key
 173     def video_result(self, video_info):
 174         """Returns a video"""
 175         video_info['_type'] = 'video'
 176         return video_info
 177     def url_result(self, url, ie=None):
 178         """Returns a url that points to a page that should be processed"""
 179         #TODO: ie should be the class used for getting the info
 180         video_info = {'_type': 'url',
 181                       'url': url,
 182                       'ie_key': ie}
 183         return video_info
 184     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 185         """Returns a playlist"""
 186         video_info = {'_type': 'playlist',
 187                       'entries': entries}
 188         if playlist_id:
 189             video_info['id'] = playlist_id
 190         if playlist_title:
 191             video_info['title'] = playlist_title
 192         return video_info
 193
 194     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 195         """
 196         Perform a regex search on the given string, using a single or a list of
 197         patterns returning the first matching group.
 198         In case of failure return a default value or raise a WARNING or a
 199         ExtractorError, depending on fatal, specifying the field name.
 200         """
 201         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 202             mobj = re.search(pattern, string, flags)
 203         else:
 204             for p in pattern:
 205                 mobj = re.search(p, string, flags)
 206                 if mobj: break
 207
 208         if sys.stderr.isatty() and os.name != 'nt':
 209             _name = u'\033[0;34m%s\033[0m' % name
 210         else:
 211             _name = name
 212
 213         if mobj:
 214             # return the first matching group
 215             return next(g for g in mobj.groups() if g is not None)
 216         elif default is not None:
 217             return default
 218         elif fatal:
 219             raise ExtractorError(u'Unable to extract %s' % _name)
 220         else:
 221             self._downloader.report_warning(u'unable to extract %s; '
 222                 u'please report this issue on GitHub.' % _name)
 223             return None
 224
 225     def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 226         """
 227         Like _search_regex, but strips HTML tags and unescapes entities.
 228         """
 229         res = self._search_regex(pattern, string, name, default, fatal, flags)
 230         if res:
 231             return clean_html(res).strip()
 232         else:
 233             return res
 234
 235 class SearchInfoExtractor(InfoExtractor):
 236     """
 237     Base class for paged search queries extractors.
 238     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 239     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 240     """
 241
 242     @classmethod
 243     def _make_valid_url(cls):
 244         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 245
 246     @classmethod
 247     def suitable(cls, url):
 248         return re.match(cls._make_valid_url(), url) is not None
 249
 250     def _real_extract(self, query):
 251         mobj = re.match(self._make_valid_url(), query)
 252         if mobj is None:
 253             raise ExtractorError(u'Invalid search query "%s"' % query)
 254
 255         prefix = mobj.group('prefix')
 256         query = mobj.group('query')
 257         if prefix == '':
 258             return self._get_n_results(query, 1)
 259         elif prefix == 'all':
 260             return self._get_n_results(query, self._MAX_RESULTS)
 261         else:
 262             n = int(prefix)
 263             if n <= 0:
 264                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 265             elif n > self._MAX_RESULTS:
 266                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 267                 n = self._MAX_RESULTS
 268             return self._get_n_results(query, n)
 269
 270     def _get_n_results(self, query, n):
 271         """Get a specified number of results for a query"""
 272         raise NotImplementedError("This method must be implemented by sublclasses")
 273
 274
 275 class YoutubeIE(InfoExtractor):
 276     """Information extractor for youtube.com."""
 277
 278     _VALID_URL = r"""^
 279                      (
 280                          (?:https?://)?                                       # http(s):// (optional)
 281                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 282                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 283                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 284                          (?:                                                  # the various things that can precede the ID:
 285                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 286                              |(?:                                             # or the v= param in all its forms
 287                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 288                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 289                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 290                                  v=
 291                              )
 292                          )?                                                   # optional -> youtube.com/xxxx is OK
 293                      )?                                                       # all until now is optional -> you can pass the naked ID
 294                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 295                      (?(1).+)?                                                # if we found the ID, everything can follow
 296                      $"""
 297     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 298     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 299     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 300     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 301     _NETRC_MACHINE = 'youtube'
 302     # Listed in order of quality
 303     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 304     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 305     _video_extensions = {
 306         '13': '3gp',
 307         '17': 'mp4',
 308         '18': 'mp4',
 309         '22': 'mp4',
 310         '37': 'mp4',
 311         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 312         '43': 'webm',
 313         '44': 'webm',
 314         '45': 'webm',
 315         '46': 'webm',
 316     }
 317     _video_dimensions = {
 318         '5': '240x400',
 319         '6': '???',
 320         '13': '???',
 321         '17': '144x176',
 322         '18': '360x640',
 323         '22': '720x1280',
 324         '34': '360x640',
 325         '35': '480x854',
 326         '37': '1080x1920',
 327         '38': '3072x4096',
 328         '43': '360x640',
 329         '44': '480x854',
 330         '45': '720x1280',
 331         '46': '1080x1920',
 332     }
 333     IE_NAME = u'youtube'
 334
 335     @classmethod
 336     def suitable(cls, url):
 337         """Receives a URL and returns True if suitable for this IE."""
 338         if YoutubePlaylistIE.suitable(url): return False
 339         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 340
 341     def report_lang(self):
 342         """Report attempt to set language."""
 343         self.to_screen(u'Setting language')
 344
 345     def report_login(self):
 346         """Report attempt to log in."""
 347         self.to_screen(u'Logging in')
 348
 349     def report_video_webpage_download(self, video_id):
 350         """Report attempt to download video webpage."""
 351         self.to_screen(u'%s: Downloading video webpage' % video_id)
 352
 353     def report_video_info_webpage_download(self, video_id):
 354         """Report attempt to download video info webpage."""
 355         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 356
 357     def report_video_subtitles_download(self, video_id):
 358         """Report attempt to download video info webpage."""
 359         self.to_screen(u'%s: Checking available subtitles' % video_id)
 360
 361     def report_video_subtitles_request(self, video_id, sub_lang, format):
 362         """Report attempt to download video info webpage."""
 363         self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 364
 365     def report_video_subtitles_available(self, video_id, sub_lang_list):
 366         """Report available subtitles."""
 367         sub_lang = ",".join(list(sub_lang_list.keys()))
 368         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
 369
 370     def report_information_extraction(self, video_id):
 371         """Report attempt to extract video information."""
 372         self.to_screen(u'%s: Extracting video information' % video_id)
 373
 374     def report_unavailable_format(self, video_id, format):
 375         """Report extracted video URL."""
 376         self.to_screen(u'%s: Format %s not available' % (video_id, format))
 377
 378     def report_rtmp_download(self):
 379         """Indicate the download will use the RTMP protocol."""
 380         self.to_screen(u'RTMP download detected')
 381
 382     @staticmethod
 383     def _decrypt_signature(s):
 384         """Decrypt the key the two subkeys must have a length of 43"""
 385         (a,b) = s.split('.')
 386         if len(a) != 43 or len(b) != 43:
 387             raise ExtractorError(u'Unable to decrypt signature, subkeys lengths not valid')
 388         b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40]
 389         a = a[-40:]
 390         s_dec = '.'.join((a,b))[::-1]
 391         return s_dec
 392
 393     def _get_available_subtitles(self, video_id):
 394         self.report_video_subtitles_download(video_id)
 395         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 396         try:
 397             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 398         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 399             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 400         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 401         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 402         if not sub_lang_list:
 403             return (u'video doesn\'t have subtitles', None)
 404         return sub_lang_list
 405
 406     def _list_available_subtitles(self, video_id):
 407         sub_lang_list = self._get_available_subtitles(video_id)
 408         self.report_video_subtitles_available(video_id, sub_lang_list)
 409
 410     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 411         """
 412         Return tuple:
 413         (error_message, sub_lang, sub)
 414         """
 415         self.report_video_subtitles_request(video_id, sub_lang, format)
 416         params = compat_urllib_parse.urlencode({
 417             'lang': sub_lang,
 418             'name': sub_name,
 419             'v': video_id,
 420             'fmt': format,
 421         })
 422         url = 'http://www.youtube.com/api/timedtext?' + params
 423         try:
 424             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 425         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 426             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 427         if not sub:
 428             return (u'Did not fetch video subtitles', None, None)
 429         return (None, sub_lang, sub)
 430
 431     def _request_automatic_caption(self, video_id, webpage):
 432         """We need the webpage for getting the captions url, pass it as an
 433            argument to speed up the process."""
 434         sub_lang = self._downloader.params.get('subtitleslang') or 'en'
 435         sub_format = self._downloader.params.get('subtitlesformat')
 436         self.to_screen(u'%s: Looking for automatic captions' % video_id)
 437         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 438         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
 439         if mobj is None:
 440             return [(err_msg, None, None)]
 441         player_config = json.loads(mobj.group(1))
 442         try:
 443             args = player_config[u'args']
 444             caption_url = args[u'ttsurl']
 445             timestamp = args[u'timestamp']
 446             params = compat_urllib_parse.urlencode({
 447                 'lang': 'en',
 448                 'tlang': sub_lang,
 449                 'fmt': sub_format,
 450                 'ts': timestamp,
 451                 'kind': 'asr',
 452             })
 453             subtitles_url = caption_url + '&' + params
 454             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
 455             return [(None, sub_lang, sub)]
 456         except KeyError:
 457             return [(err_msg, None, None)]
 458
 459     def _extract_subtitle(self, video_id):
 460         """
 461         Return a list with a tuple:
 462         [(error_message, sub_lang, sub)]
 463         """
 464         sub_lang_list = self._get_available_subtitles(video_id)
 465         sub_format = self._downloader.params.get('subtitlesformat')
 466         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 467             return [(sub_lang_list[0], None, None)]
 468         if self._downloader.params.get('subtitleslang', False):
 469             sub_lang = self._downloader.params.get('subtitleslang')
 470         elif 'en' in sub_lang_list:
 471             sub_lang = 'en'
 472         else:
 473             sub_lang = list(sub_lang_list.keys())[0]
 474         if not sub_lang in sub_lang_list:
 475             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 476
 477         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 478         return [subtitle]
 479
 480     def _extract_all_subtitles(self, video_id):
 481         sub_lang_list = self._get_available_subtitles(video_id)
 482         sub_format = self._downloader.params.get('subtitlesformat')
 483         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 484             return [(sub_lang_list[0], None, None)]
 485         subtitles = []
 486         for sub_lang in sub_lang_list:
 487             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 488             subtitles.append(subtitle)
 489         return subtitles
 490
 491     def _print_formats(self, formats):
 492         print('Available formats:')
 493         for x in formats:
 494             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 495
 496     def _real_initialize(self):
 497         if self._downloader is None:
 498             return
 499
 500         username = None
 501         password = None
 502         downloader_params = self._downloader.params
 503
 504         # Attempt to use provided username and password or .netrc data
 505         if downloader_params.get('username', None) is not None:
 506             username = downloader_params['username']
 507             password = downloader_params['password']
 508         elif downloader_params.get('usenetrc', False):
 509             try:
 510                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 511                 if info is not None:
 512                     username = info[0]
 513                     password = info[2]
 514                 else:
 515                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 516             except (IOError, netrc.NetrcParseError) as err:
 517                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 518                 return
 519
 520         # Set language
 521         request = compat_urllib_request.Request(self._LANG_URL)
 522         try:
 523             self.report_lang()
 524             compat_urllib_request.urlopen(request).read()
 525         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 526             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 527             return
 528
 529         # No authentication to be performed
 530         if username is None:
 531             return
 532
 533         request = compat_urllib_request.Request(self._LOGIN_URL)
 534         try:
 535             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 536         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 537             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 538             return
 539
 540         galx = None
 541         dsh = None
 542         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 543         if match:
 544           galx = match.group(1)
 545
 546         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 547         if match:
 548           dsh = match.group(1)
 549
 550         # Log in
 551         login_form_strs = {
 552                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 553                 u'Email': username,
 554                 u'GALX': galx,
 555                 u'Passwd': password,
 556                 u'PersistentCookie': u'yes',
 557                 u'_utf8': u'霱',
 558                 u'bgresponse': u'js_disabled',
 559                 u'checkConnection': u'',
 560                 u'checkedDomains': u'youtube',
 561                 u'dnConn': u'',
 562                 u'dsh': dsh,
 563                 u'pstMsg': u'0',
 564                 u'rmShown': u'1',
 565                 u'secTok': u'',
 566                 u'signIn': u'Sign in',
 567                 u'timeStmp': u'',
 568                 u'service': u'youtube',
 569                 u'uilel': u'3',
 570                 u'hl': u'en_US',
 571         }
 572         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 573         # chokes on unicode
 574         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 575         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 576         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 577         try:
 578             self.report_login()
 579             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 580             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 581                 self._downloader.report_warning(u'unable to log in: bad username or password')
 582                 return
 583         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 584             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 585             return
 586
 587         # Confirm age
 588         age_form = {
 589                 'next_url':     '/',
 590                 'action_confirm':   'Confirm',
 591                 }
 592         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 593         try:
 594             self.report_age_confirmation()
 595             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 596         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 597             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 598
 599     def _extract_id(self, url):
 600         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 601         if mobj is None:
 602             raise ExtractorError(u'Invalid URL: %s' % url)
 603         video_id = mobj.group(2)
 604         return video_id
 605
 606     def _real_extract(self, url):
 607         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 608         mobj = re.search(self._NEXT_URL_RE, url)
 609         if mobj:
 610             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 611         video_id = self._extract_id(url)
 612
 613         # Get video webpage
 614         self.report_video_webpage_download(video_id)
 615         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 616         request = compat_urllib_request.Request(url)
 617         try:
 618             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 619         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 620             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
 621
 622         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 623
 624         # Attempt to extract SWF player URL
 625         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 626         if mobj is not None:
 627             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 628         else:
 629             player_url = None
 630
 631         # Get video info
 632         self.report_video_info_webpage_download(video_id)
 633         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 634             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 635                     % (video_id, el_type))
 636             video_info_webpage = self._download_webpage(video_info_url, video_id,
 637                                     note=False,
 638                                     errnote='unable to download video info webpage')
 639             video_info = compat_parse_qs(video_info_webpage)
 640             if 'token' in video_info:
 641                 break
 642         if 'token' not in video_info:
 643             if 'reason' in video_info:
 644                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
 645             else:
 646                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
 647
 648         # Check for "rental" videos
 649         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 650             raise ExtractorError(u'"rental" videos not supported')
 651
 652         # Start extracting information
 653         self.report_information_extraction(video_id)
 654
 655         # uploader
 656         if 'author' not in video_info:
 657             raise ExtractorError(u'Unable to extract uploader name')
 658         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 659
 660         # uploader_id
 661         video_uploader_id = None
 662         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 663         if mobj is not None:
 664             video_uploader_id = mobj.group(1)
 665         else:
 666             self._downloader.report_warning(u'unable to extract uploader nickname')
 667
 668         # title
 669         if 'title' not in video_info:
 670             raise ExtractorError(u'Unable to extract video title')
 671         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 672
 673         # thumbnail image
 674         if 'thumbnail_url' not in video_info:
 675             self._downloader.report_warning(u'unable to extract video thumbnail')
 676             video_thumbnail = ''
 677         else:   # don't panic if we can't find it
 678             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 679
 680         # upload date
 681         upload_date = None
 682         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 683         if mobj is not None:
 684             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 685             upload_date = unified_strdate(upload_date)
 686
 687         # description
 688         video_description = get_element_by_id("eow-description", video_webpage)
 689         if video_description:
 690             video_description = clean_html(video_description)
 691         else:
 692             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 693             if fd_mobj:
 694                 video_description = unescapeHTML(fd_mobj.group(1))
 695             else:
 696                 video_description = u''
 697
 698         # subtitles
 699         video_subtitles = None
 700
 701         if self._downloader.params.get('writesubtitles', False):
 702             video_subtitles = self._extract_subtitle(video_id)
 703             if video_subtitles:
 704                 (sub_error, sub_lang, sub) = video_subtitles[0]
 705                 if sub_error:
 706                     # We try with the automatic captions
 707                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
 708                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
 709                     if sub is not None:
 710                         pass
 711                     else:
 712                         # We report the original error
 713                         self._downloader.report_warning(sub_error)
 714
 715         if self._downloader.params.get('allsubtitles', False):
 716             video_subtitles = self._extract_all_subtitles(video_id)
 717             for video_subtitle in video_subtitles:
 718                 (sub_error, sub_lang, sub) = video_subtitle
 719                 if sub_error:
 720                     self._downloader.report_warning(sub_error)
 721
 722         if self._downloader.params.get('listsubtitles', False):
 723             sub_lang_list = self._list_available_subtitles(video_id)
 724             return
 725
 726         if 'length_seconds' not in video_info:
 727             self._downloader.report_warning(u'unable to extract video duration')
 728             video_duration = ''
 729         else:
 730             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 731
 732         # token
 733         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 734
 735         # Decide which formats to download
 736         req_format = self._downloader.params.get('format', None)
 737
 738         try:
 739             mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
 740             info = json.loads(mobj.group(1))
 741             args = info['args']
 742             if args.get('ptk','') == 'vevo' or 'dashmpd':
 743                 # Vevo videos with encrypted signatures
 744                 self.to_screen(u'Vevo video detected.')
 745                 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
 746         except ValueError:
 747             pass
 748
 749         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 750             self.report_rtmp_download()
 751             video_url_list = [(None, video_info['conn'][0])]
 752         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 753             url_map = {}
 754             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
 755                 url_data = compat_parse_qs(url_data_str)
 756                 if 'itag' in url_data and 'url' in url_data:
 757                     url = url_data['url'][0]
 758                     if 'sig' in url_data:
 759                         url += '&signature=' + url_data['sig'][0]
 760                     if 's' in url_data:
 761                         signature = self._decrypt_signature(url_data['s'][0])
 762                         url += '&signature=' + signature
 763                     if 'ratebypass' not in url:
 764                         url += '&ratebypass=yes'
 765                     url_map[url_data['itag'][0]] = url
 766
 767             format_limit = self._downloader.params.get('format_limit', None)
 768             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 769             if format_limit is not None and format_limit in available_formats:
 770                 format_list = available_formats[available_formats.index(format_limit):]
 771             else:
 772                 format_list = available_formats
 773             existing_formats = [x for x in format_list if x in url_map]
 774             if len(existing_formats) == 0:
 775                 raise ExtractorError(u'no known formats available for video')
 776             if self._downloader.params.get('listformats', None):
 777                 self._print_formats(existing_formats)
 778                 return
 779             if req_format is None or req_format == 'best':
 780                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 781             elif req_format == 'worst':
 782                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 783             elif req_format in ('-1', 'all'):
 784                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 785             else:
 786                 # Specific formats. We pick the first in a slash-delimeted sequence.
 787                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 788                 req_formats = req_format.split('/')
 789                 video_url_list = None
 790                 for rf in req_formats:
 791                     if rf in url_map:
 792                         video_url_list = [(rf, url_map[rf])]
 793                         break
 794                 if video_url_list is None:
 795                     raise ExtractorError(u'requested format not available')
 796         else:
 797             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
 798
 799         results = []
 800         for format_param, video_real_url in video_url_list:
 801             # Extension
 802             video_extension = self._video_extensions.get(format_param, 'flv')
 803
 804             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 805                                               self._video_dimensions.get(format_param, '???'))
 806
 807             results.append({
 808                 'id':       video_id,
 809                 'url':      video_real_url,
 810                 'uploader': video_uploader,
 811                 'uploader_id': video_uploader_id,
 812                 'upload_date':  upload_date,
 813                 'title':    video_title,
 814                 'ext':      video_extension,
 815                 'format':   video_format,
 816                 'thumbnail':    video_thumbnail,
 817                 'description':  video_description,
 818                 'player_url':   player_url,
 819                 'subtitles':    video_subtitles,
 820                 'duration':     video_duration
 821             })
 822         return results
 823
 824
 825 class MetacafeIE(InfoExtractor):
 826     """Information Extractor for metacafe.com."""
 827
 828     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 829     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 830     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 831     IE_NAME = u'metacafe'
 832
 833     def report_disclaimer(self):
 834         """Report disclaimer retrieval."""
 835         self.to_screen(u'Retrieving disclaimer')
 836
 837     def _real_initialize(self):
 838         # Retrieve disclaimer
 839         request = compat_urllib_request.Request(self._DISCLAIMER)
 840         try:
 841             self.report_disclaimer()
 842             disclaimer = compat_urllib_request.urlopen(request).read()
 843         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 844             raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
 845
 846         # Confirm age
 847         disclaimer_form = {
 848             'filters': '0',
 849             'submit': "Continue - I'm over 18",
 850             }
 851         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 852         try:
 853             self.report_age_confirmation()
 854             disclaimer = compat_urllib_request.urlopen(request).read()
 855         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 856             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 857
 858     def _real_extract(self, url):
 859         # Extract id and simplified title from URL
 860         mobj = re.match(self._VALID_URL, url)
 861         if mobj is None:
 862             raise ExtractorError(u'Invalid URL: %s' % url)
 863
 864         video_id = mobj.group(1)
 865
 866         # Check if video comes from YouTube
 867         mobj2 = re.match(r'^yt-(.*)$', video_id)
 868         if mobj2 is not None:
 869             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
 870
 871         # Retrieve video webpage to extract further information
 872         webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
 873
 874         # Extract URL, uploader and title from webpage
 875         self.report_extraction(video_id)
 876         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 877         if mobj is not None:
 878             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 879             video_extension = mediaURL[-3:]
 880
 881             # Extract gdaKey if available
 882             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 883             if mobj is None:
 884                 video_url = mediaURL
 885             else:
 886                 gdaKey = mobj.group(1)
 887                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 888         else:
 889             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 890             if mobj is None:
 891                 raise ExtractorError(u'Unable to extract media URL')
 892             vardict = compat_parse_qs(mobj.group(1))
 893             if 'mediaData' not in vardict:
 894                 raise ExtractorError(u'Unable to extract media URL')
 895             mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
 896             if mobj is None:
 897                 raise ExtractorError(u'Unable to extract media URL')
 898             mediaURL = mobj.group('mediaURL').replace('\\/', '/')
 899             video_extension = mediaURL[-3:]
 900             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
 901
 902         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 903         if mobj is None:
 904             raise ExtractorError(u'Unable to extract title')
 905         video_title = mobj.group(1).decode('utf-8')
 906
 907         mobj = re.search(r'submitter=(.*?);', webpage)
 908         if mobj is None:
 909             raise ExtractorError(u'Unable to extract uploader nickname')
 910         video_uploader = mobj.group(1)
 911
 912         return [{
 913             'id':       video_id.decode('utf-8'),
 914             'url':      video_url.decode('utf-8'),
 915             'uploader': video_uploader.decode('utf-8'),
 916             'upload_date':  None,
 917             'title':    video_title,
 918             'ext':      video_extension.decode('utf-8'),
 919         }]
 920
 921 class DailymotionIE(InfoExtractor):
 922     """Information Extractor for Dailymotion"""
 923
 924     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 925     IE_NAME = u'dailymotion'
 926
 927     def _real_extract(self, url):
 928         # Extract id and simplified title from URL
 929         mobj = re.match(self._VALID_URL, url)
 930         if mobj is None:
 931             raise ExtractorError(u'Invalid URL: %s' % url)
 932
 933         video_id = mobj.group(1).split('_')[0].split('?')[0]
 934
 935         video_extension = 'mp4'
 936
 937         # Retrieve video webpage to extract further information
 938         request = compat_urllib_request.Request(url)
 939         request.add_header('Cookie', 'family_filter=off')
 940         webpage = self._download_webpage(request, video_id)
 941
 942         # Extract URL, uploader and title from webpage
 943         self.report_extraction(video_id)
 944         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 945         if mobj is None:
 946             raise ExtractorError(u'Unable to extract media URL')
 947         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 948
 949         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 950             if key in flashvars:
 951                 max_quality = key
 952                 self.to_screen(u'Using %s' % key)
 953                 break
 954         else:
 955             raise ExtractorError(u'Unable to extract video URL')
 956
 957         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 958         if mobj is None:
 959             raise ExtractorError(u'Unable to extract video URL')
 960
 961         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 962
 963         # TODO: support choosing qualities
 964
 965         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 966         if mobj is None:
 967             raise ExtractorError(u'Unable to extract title')
 968         video_title = unescapeHTML(mobj.group('title'))
 969
 970         video_uploader = None
 971         video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
 972                                              # Looking for official user
 973                                              r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
 974                                             webpage, 'video uploader')
 975
 976         video_upload_date = None
 977         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 978         if mobj is not None:
 979             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 980
 981         return [{
 982             'id':       video_id,
 983             'url':      video_url,
 984             'uploader': video_uploader,
 985             'upload_date':  video_upload_date,
 986             'title':    video_title,
 987             'ext':      video_extension,
 988         }]
 989
 990
 991 class PhotobucketIE(InfoExtractor):
 992     """Information extractor for photobucket.com."""
 993
 994     # TODO: the original _VALID_URL was:
 995     # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 996     # Check if it's necessary to keep the old extracion process
 997     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
 998     IE_NAME = u'photobucket'
 999
1000     def _real_extract(self, url):
1001         # Extract id from URL
1002         mobj = re.match(self._VALID_URL, url)
1003         if mobj is None:
1004             raise ExtractorError(u'Invalid URL: %s' % url)
1005
1006         video_id = mobj.group('id')
1007
1008         video_extension = mobj.group('ext')
1009
1010         # Retrieve video webpage to extract further information
1011         webpage = self._download_webpage(url, video_id)
1012
1013         # Extract URL, uploader, and title from webpage
1014         self.report_extraction(video_id)
1015         # We try first by looking the javascript code:
1016         mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
1017         if mobj is not None:
1018             info = json.loads(mobj.group('json'))
1019             return [{
1020                 'id':       video_id,
1021                 'url':      info[u'downloadUrl'],
1022                 'uploader': info[u'username'],
1023                 'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
1024                 'title':    info[u'title'],
1025                 'ext':      video_extension,
1026                 'thumbnail': info[u'thumbUrl'],
1027             }]
1028
1029         # We try looking in other parts of the webpage
1030         video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1031             webpage, u'video URL')
1032
1033         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1034         if mobj is None:
1035             raise ExtractorError(u'Unable to extract title')
1036         video_title = mobj.group(1).decode('utf-8')
1037         video_uploader = mobj.group(2).decode('utf-8')
1038
1039         return [{
1040             'id':       video_id.decode('utf-8'),
1041             'url':      video_url.decode('utf-8'),
1042             'uploader': video_uploader,
1043             'upload_date':  None,
1044             'title':    video_title,
1045             'ext':      video_extension.decode('utf-8'),
1046         }]
1047
1048
1049 class YahooIE(InfoExtractor):
1050     """Information extractor for screen.yahoo.com."""
1051     _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1052
1053     def _real_extract(self, url):
1054         mobj = re.match(self._VALID_URL, url)
1055         if mobj is None:
1056             raise ExtractorError(u'Invalid URL: %s' % url)
1057         video_id = mobj.group('id')
1058         webpage = self._download_webpage(url, video_id)
1059         m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1060
1061         if m_id is None:
1062             # TODO: Check which url parameters are required
1063             info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1064             webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
1065             info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1066                         <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1067                         <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1068                         <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1069                         '''
1070             self.report_extraction(video_id)
1071             m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1072             if m_info is None:
1073                 raise ExtractorError(u'Unable to extract video info')
1074             video_title = m_info.group('title')
1075             video_description = m_info.group('description')
1076             video_thumb = m_info.group('thumb')
1077             video_date = m_info.group('date')
1078             video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1079
1080             # TODO: Find a way to get mp4 videos
1081             rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1082             webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1083             m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1084             video_url = m_rest.group('url')
1085             video_path = m_rest.group('path')
1086             if m_rest is None:
1087                 raise ExtractorError(u'Unable to extract video url')
1088
1089         else: # We have to use a different method if another id is defined
1090             long_id = m_id.group('new_id')
1091             info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1092             webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
1093             json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1094             info = json.loads(json_str)
1095             res = info[u'query'][u'results'][u'mediaObj'][0]
1096             stream = res[u'streams'][0]
1097             video_path = stream[u'path']
1098             video_url = stream[u'host']
1099             meta = res[u'meta']
1100             video_title = meta[u'title']
1101             video_description = meta[u'description']
1102             video_thumb = meta[u'thumbnail']
1103             video_date = None # I can't find it
1104
1105         info_dict = {
1106                      'id': video_id,
1107                      'url': video_url,
1108                      'play_path': video_path,
1109                      'title':video_title,
1110                      'description': video_description,
1111                      'thumbnail': video_thumb,
1112                      'upload_date': video_date,
1113                      'ext': 'flv',
1114                      }
1115         return info_dict
1116
1117 class VimeoIE(InfoExtractor):
1118     """Information extractor for vimeo.com."""
1119
1120     # _VALID_URL matches Vimeo URLs
1121     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1122     IE_NAME = u'vimeo'
1123
1124     def _verify_video_password(self, url, video_id, webpage):
1125         password = self._downloader.params.get('password', None)
1126         if password is None:
1127             raise ExtractorError(u'This video is protected by a password, use the --password option')
1128         token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
1129         data = compat_urllib_parse.urlencode({'password': password,
1130                                               'token': token})
1131         # I didn't manage to use the password with https
1132         if url.startswith('https'):
1133             pass_url = url.replace('https','http')
1134         else:
1135             pass_url = url
1136         password_request = compat_urllib_request.Request(pass_url+'/password', data)
1137         password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1138         password_request.add_header('Cookie', 'xsrft=%s' % token)
1139         pass_web = self._download_webpage(password_request, video_id,
1140                                           u'Verifying the password',
1141                                           u'Wrong password')
1142
1143     def _real_extract(self, url, new_video=True):
1144         # Extract ID from URL
1145         mobj = re.match(self._VALID_URL, url)
1146         if mobj is None:
1147             raise ExtractorError(u'Invalid URL: %s' % url)
1148
1149         video_id = mobj.group('id')
1150         if not mobj.group('proto'):
1151             url = 'https://' + url
1152         if mobj.group('direct_link') or mobj.group('pro'):
1153             url = 'https://vimeo.com/' + video_id
1154
1155         # Retrieve video webpage to extract further information
1156         request = compat_urllib_request.Request(url, None, std_headers)
1157         webpage = self._download_webpage(request, video_id)
1158
1159         # Now we begin extracting as much information as we can from what we
1160         # retrieved. First we extract the information common to all extractors,
1161         # and latter we extract those that are Vimeo specific.
1162         self.report_extraction(video_id)
1163
1164         # Extract the config JSON
1165         try:
1166             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1167             config = json.loads(config)
1168         except:
1169             if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1170                 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1171
1172             if re.search('If so please provide the correct password.', webpage):
1173                 self._verify_video_password(url, video_id, webpage)
1174                 return self._real_extract(url)
1175             else:
1176                 raise ExtractorError(u'Unable to extract info section')
1177
1178         # Extract title
1179         video_title = config["video"]["title"]
1180
1181         # Extract uploader and uploader_id
1182         video_uploader = config["video"]["owner"]["name"]
1183         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1184
1185         # Extract video thumbnail
1186         video_thumbnail = config["video"]["thumbnail"]
1187
1188         # Extract video description
1189         video_description = get_element_by_attribute("itemprop", "description", webpage)
1190         if video_description: video_description = clean_html(video_description)
1191         else: video_description = u''
1192
1193         # Extract upload date
1194         video_upload_date = None
1195         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1196         if mobj is not None:
1197             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1198
1199         # Vimeo specific: extract request signature and timestamp
1200         sig = config['request']['signature']
1201         timestamp = config['request']['timestamp']
1202
1203         # Vimeo specific: extract video codec and quality information
1204         # First consider quality, then codecs, then take everything
1205         # TODO bind to format param
1206         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1207         files = { 'hd': [], 'sd': [], 'other': []}
1208         for codec_name, codec_extension in codecs:
1209             if codec_name in config["video"]["files"]:
1210                 if 'hd' in config["video"]["files"][codec_name]:
1211                     files['hd'].append((codec_name, codec_extension, 'hd'))
1212                 elif 'sd' in config["video"]["files"][codec_name]:
1213                     files['sd'].append((codec_name, codec_extension, 'sd'))
1214                 else:
1215                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1216
1217         for quality in ('hd', 'sd', 'other'):
1218             if len(files[quality]) > 0:
1219                 video_quality = files[quality][0][2]
1220                 video_codec = files[quality][0][0]
1221                 video_extension = files[quality][0][1]
1222                 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1223                 break
1224         else:
1225             raise ExtractorError(u'No known codec found')
1226
1227         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1228                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1229
1230         return [{
1231             'id':       video_id,
1232             'url':      video_url,
1233             'uploader': video_uploader,
1234             'uploader_id': video_uploader_id,
1235             'upload_date':  video_upload_date,
1236             'title':    video_title,
1237             'ext':      video_extension,
1238             'thumbnail':    video_thumbnail,
1239             'description':  video_description,
1240         }]
1241
1242
1243 class ArteTvIE(InfoExtractor):
1244     """arte.tv information extractor."""
1245
1246     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1247     _LIVE_URL = r'index-[0-9]+\.html$'
1248
1249     IE_NAME = u'arte.tv'
1250
1251     def fetch_webpage(self, url):
1252         request = compat_urllib_request.Request(url)
1253         try:
1254             self.report_download_webpage(url)
1255             webpage = compat_urllib_request.urlopen(request).read()
1256         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1257             raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1258         except ValueError as err:
1259             raise ExtractorError(u'Invalid URL: %s' % url)
1260         return webpage
1261
1262     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1263         page = self.fetch_webpage(url)
1264         mobj = re.search(regex, page, regexFlags)
1265         info = {}
1266
1267         if mobj is None:
1268             raise ExtractorError(u'Invalid URL: %s' % url)
1269
1270         for (i, key, err) in matchTuples:
1271             if mobj.group(i) is None:
1272                 raise ExtractorError(err)
1273             else:
1274                 info[key] = mobj.group(i)
1275
1276         return info
1277
1278     def extractLiveStream(self, url):
1279         video_lang = url.split('/')[-4]
1280         info = self.grep_webpage(
1281             url,
1282             r'src="(.*?/videothek_js.*?\.js)',
1283             0,
1284             [
1285                 (1, 'url', u'Invalid URL: %s' % url)
1286             ]
1287         )
1288         http_host = url.split('/')[2]
1289         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1290         info = self.grep_webpage(
1291             next_url,
1292             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1293                 '(http://.*?\.swf).*?' +
1294                 '(rtmp://.*?)\'',
1295             re.DOTALL,
1296             [
1297                 (1, 'path',   u'could not extract video path: %s' % url),
1298                 (2, 'player', u'could not extract video player: %s' % url),
1299                 (3, 'url',    u'could not extract video url: %s' % url)
1300             ]
1301         )
1302         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1303
1304     def extractPlus7Stream(self, url):
1305         video_lang = url.split('/')[-3]
1306         info = self.grep_webpage(
1307             url,
1308             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1309             0,
1310             [
1311                 (1, 'url', u'Invalid URL: %s' % url)
1312             ]
1313         )
1314         next_url = compat_urllib_parse.unquote(info.get('url'))
1315         info = self.grep_webpage(
1316             next_url,
1317             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1318             0,
1319             [
1320                 (1, 'url', u'Could not find <video> tag: %s' % url)
1321             ]
1322         )
1323         next_url = compat_urllib_parse.unquote(info.get('url'))
1324
1325         info = self.grep_webpage(
1326             next_url,
1327             r'<video id="(.*?)".*?>.*?' +
1328                 '<name>(.*?)</name>.*?' +
1329                 '<dateVideo>(.*?)</dateVideo>.*?' +
1330                 '<url quality="hd">(.*?)</url>',
1331             re.DOTALL,
1332             [
1333                 (1, 'id',    u'could not extract video id: %s' % url),
1334                 (2, 'title', u'could not extract video title: %s' % url),
1335                 (3, 'date',  u'could not extract video date: %s' % url),
1336                 (4, 'url',   u'could not extract video url: %s' % url)
1337             ]
1338         )
1339
1340         return {
1341             'id':           info.get('id'),
1342             'url':          compat_urllib_parse.unquote(info.get('url')),
1343             'uploader':     u'arte.tv',
1344             'upload_date':  unified_strdate(info.get('date')),
1345             'title':        info.get('title').decode('utf-8'),
1346             'ext':          u'mp4',
1347             'format':       u'NA',
1348             'player_url':   None,
1349         }
1350
1351     def _real_extract(self, url):
1352         video_id = url.split('/')[-1]
1353         self.report_extraction(video_id)
1354
1355         if re.search(self._LIVE_URL, video_id) is not None:
1356             self.extractLiveStream(url)
1357             return
1358         else:
1359             info = self.extractPlus7Stream(url)
1360
1361         return [info]
1362
1363
1364 class GenericIE(InfoExtractor):
1365     """Generic last-resort information extractor."""
1366
1367     _VALID_URL = r'.*'
1368     IE_NAME = u'generic'
1369
1370     def report_download_webpage(self, video_id):
1371         """Report webpage download."""
1372         if not self._downloader.params.get('test', False):
1373             self._downloader.report_warning(u'Falling back on generic information extractor.')
1374         super(GenericIE, self).report_download_webpage(video_id)
1375
1376     def report_following_redirect(self, new_url):
1377         """Report information extraction."""
1378         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1379
1380     def _test_redirect(self, url):
1381         """Check if it is a redirect, like url shorteners, in case return the new url."""
1382         class HeadRequest(compat_urllib_request.Request):
1383             def get_method(self):
1384                 return "HEAD"
1385
1386         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1387             """
1388             Subclass the HTTPRedirectHandler to make it use our
1389             HeadRequest also on the redirected URL
1390             """
1391             def redirect_request(self, req, fp, code, msg, headers, newurl):
1392                 if code in (301, 302, 303, 307):
1393                     newurl = newurl.replace(' ', '%20')
1394                     newheaders = dict((k,v) for k,v in req.headers.items()
1395                                       if k.lower() not in ("content-length", "content-type"))
1396                     return HeadRequest(newurl,
1397                                        headers=newheaders,
1398                                        origin_req_host=req.get_origin_req_host(),
1399                                        unverifiable=True)
1400                 else:
1401                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1402
1403         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1404             """
1405             Fallback to GET if HEAD is not allowed (405 HTTP error)
1406             """
1407             def http_error_405(self, req, fp, code, msg, headers):
1408                 fp.read()
1409                 fp.close()
1410
1411                 newheaders = dict((k,v) for k,v in req.headers.items()
1412                                   if k.lower() not in ("content-length", "content-type"))
1413                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1414                                                  headers=newheaders,
1415                                                  origin_req_host=req.get_origin_req_host(),
1416                                                  unverifiable=True))
1417
1418         # Build our opener
1419         opener = compat_urllib_request.OpenerDirector()
1420         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1421                         HTTPMethodFallback, HEADRedirectHandler,
1422                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1423             opener.add_handler(handler())
1424
1425         response = opener.open(HeadRequest(url))
1426         if response is None:
1427             raise ExtractorError(u'Invalid URL protocol')
1428         new_url = response.geturl()
1429
1430         if url == new_url:
1431             return False
1432
1433         self.report_following_redirect(new_url)
1434         return new_url
1435
1436     def _real_extract(self, url):
1437         new_url = self._test_redirect(url)
1438         if new_url: return [self.url_result(new_url)]
1439
1440         video_id = url.split('/')[-1]
1441         try:
1442             webpage = self._download_webpage(url, video_id)
1443         except ValueError as err:
1444             # since this is the last-resort InfoExtractor, if
1445             # this error is thrown, it'll be thrown here
1446             raise ExtractorError(u'Invalid URL: %s' % url)
1447
1448         self.report_extraction(video_id)
1449         # Start with something easy: JW Player in SWFObject
1450         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1451         if mobj is None:
1452             # Broaden the search a little bit
1453             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1454         if mobj is None:
1455             # Broaden the search a little bit: JWPlayer JS loader
1456             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1457         if mobj is None:
1458             # Try to find twitter cards info
1459             mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1460         if mobj is None:
1461             # We look for Open Graph info:
1462             # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
1463             m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1464             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1465             if m_video_type is not None:
1466                 mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
1467         if mobj is None:
1468             raise ExtractorError(u'Invalid URL: %s' % url)
1469
1470         # It's possible that one of the regexes
1471         # matched, but returned an empty group:
1472         if mobj.group(1) is None:
1473             raise ExtractorError(u'Invalid URL: %s' % url)
1474
1475         video_url = compat_urllib_parse.unquote(mobj.group(1))
1476         video_id = os.path.basename(video_url)
1477
1478         # here's a fun little line of code for you:
1479         video_extension = os.path.splitext(video_id)[1][1:]
1480         video_id = os.path.splitext(video_id)[0]
1481
1482         # it's tempting to parse this further, but you would
1483         # have to take into account all the variations like
1484         #   Video Title - Site Name
1485         #   Site Name | Video Title
1486         #   Video Title - Tagline | Site Name
1487         # and so on and so forth; it's just not practical
1488         video_title = self._html_search_regex(r'<title>(.*)</title>',
1489             webpage, u'video title')
1490
1491         # video uploader is domain name
1492         video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1493             url, u'video uploader')
1494
1495         return [{
1496             'id':       video_id,
1497             'url':      video_url,
1498             'uploader': video_uploader,
1499             'upload_date':  None,
1500             'title':    video_title,
1501             'ext':      video_extension,
1502         }]
1503
1504
1505 class YoutubeSearchIE(SearchInfoExtractor):
1506     """Information Extractor for YouTube search queries."""
1507     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1508     _MAX_RESULTS = 1000
1509     IE_NAME = u'youtube:search'
1510     _SEARCH_KEY = 'ytsearch'
1511
1512     def report_download_page(self, query, pagenum):
1513         """Report attempt to download search page with given number."""
1514         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1515
1516     def _get_n_results(self, query, n):
1517         """Get a specified number of results for a query"""
1518
1519         video_ids = []
1520         pagenum = 0
1521         limit = n
1522
1523         while (50 * pagenum) < limit:
1524             self.report_download_page(query, pagenum+1)
1525             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1526             request = compat_urllib_request.Request(result_url)
1527             try:
1528                 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1529             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1530                 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1531             api_response = json.loads(data)['data']
1532
1533             if not 'items' in api_response:
1534                 raise ExtractorError(u'[youtube] No video results')
1535
1536             new_ids = list(video['id'] for video in api_response['items'])
1537             video_ids += new_ids
1538
1539             limit = min(n, api_response['totalItems'])
1540             pagenum += 1
1541
1542         if len(video_ids) > n:
1543             video_ids = video_ids[:n]
1544         videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1545         return self.playlist_result(videos, query)
1546
1547
1548 class GoogleSearchIE(SearchInfoExtractor):
1549     """Information Extractor for Google Video search queries."""
1550     _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1551     _MAX_RESULTS = 1000
1552     IE_NAME = u'video.google:search'
1553     _SEARCH_KEY = 'gvsearch'
1554
1555     def _get_n_results(self, query, n):
1556         """Get a specified number of results for a query"""
1557
1558         res = {
1559             '_type': 'playlist',
1560             'id': query,
1561             'entries': []
1562         }
1563
1564         for pagenum in itertools.count(1):
1565             result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1566             webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1567                                              note='Downloading result page ' + str(pagenum))
1568
1569             for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1570                 e = {
1571                     '_type': 'url',
1572                     'url': mobj.group(1)
1573                 }
1574                 res['entries'].append(e)
1575
1576             if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1577                 return res
1578
1579 class YahooSearchIE(SearchInfoExtractor):
1580     """Information Extractor for Yahoo! Video search queries."""
1581
1582     _MAX_RESULTS = 1000
1583     IE_NAME = u'screen.yahoo:search'
1584     _SEARCH_KEY = 'yvsearch'
1585
1586     def _get_n_results(self, query, n):
1587         """Get a specified number of results for a query"""
1588
1589         res = {
1590             '_type': 'playlist',
1591             'id': query,
1592             'entries': []
1593         }
1594         for pagenum in itertools.count(0):
1595             result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1596             webpage = self._download_webpage(result_url, query,
1597                                              note='Downloading results page '+str(pagenum+1))
1598             info = json.loads(webpage)
1599             m = info[u'm']
1600             results = info[u'results']
1601
1602             for (i, r) in enumerate(results):
1603                 if (pagenum * 30) +i >= n:
1604                     break
1605                 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1606                 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1607                 res['entries'].append(e)
1608             if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1609                 break
1610
1611         return res
1612
1613
1614 class YoutubePlaylistIE(InfoExtractor):
1615     """Information Extractor for YouTube playlists."""
1616
1617     _VALID_URL = r"""(?:
1618                         (?:https?://)?
1619                         (?:\w+\.)?
1620                         youtube\.com/
1621                         (?:
1622                            (?:course|view_play_list|my_playlists|artist|playlist|watch)
1623                            \? (?:.*?&)*? (?:p|a|list)=
1624                         |  p/
1625                         )
1626                         ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1627                         .*
1628                      |
1629                         ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1630                      )"""
1631     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1632     _MAX_RESULTS = 50
1633     IE_NAME = u'youtube:playlist'
1634
1635     @classmethod
1636     def suitable(cls, url):
1637         """Receives a URL and returns True if suitable for this IE."""
1638         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1639
1640     def _real_extract(self, url):
1641         # Extract playlist id
1642         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1643         if mobj is None:
1644             raise ExtractorError(u'Invalid URL: %s' % url)
1645
1646         # Download playlist videos from API
1647         playlist_id = mobj.group(1) or mobj.group(2)
1648         page_num = 1
1649         videos = []
1650
1651         while True:
1652             url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1653             page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1654
1655             try:
1656                 response = json.loads(page)
1657             except ValueError as err:
1658                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1659
1660             if 'feed' not in response:
1661                 raise ExtractorError(u'Got a malformed response from YouTube API')
1662             playlist_title = response['feed']['title']['$t']
1663             if 'entry' not in response['feed']:
1664                 # Number of videos is a multiple of self._MAX_RESULTS
1665                 break
1666
1667             for entry in response['feed']['entry']:
1668                 index = entry['yt$position']['$t']
1669                 if 'media$group' in entry and 'media$player' in entry['media$group']:
1670                     videos.append((index, entry['media$group']['media$player']['url']))
1671
1672             if len(response['feed']['entry']) < self._MAX_RESULTS:
1673                 break
1674             page_num += 1
1675
1676         videos = [v[1] for v in sorted(videos)]
1677
1678         url_results = [self.url_result(url, 'Youtube') for url in videos]
1679         return [self.playlist_result(url_results, playlist_id, playlist_title)]
1680
1681
1682 class YoutubeChannelIE(InfoExtractor):
1683     """Information Extractor for YouTube channels."""
1684
1685     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1686     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1687     _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1688     _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1689     IE_NAME = u'youtube:channel'
1690
1691     def extract_videos_from_page(self, page):
1692         ids_in_page = []
1693         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1694             if mobj.group(1) not in ids_in_page:
1695                 ids_in_page.append(mobj.group(1))
1696         return ids_in_page
1697
1698     def _real_extract(self, url):
1699         # Extract channel id
1700         mobj = re.match(self._VALID_URL, url)
1701         if mobj is None:
1702             raise ExtractorError(u'Invalid URL: %s' % url)
1703
1704         # Download channel page
1705         channel_id = mobj.group(1)
1706         video_ids = []
1707         pagenum = 1
1708
1709         url = self._TEMPLATE_URL % (channel_id, pagenum)
1710         page = self._download_webpage(url, channel_id,
1711                                       u'Downloading page #%s' % pagenum)
1712
1713         # Extract video identifiers
1714         ids_in_page = self.extract_videos_from_page(page)
1715         video_ids.extend(ids_in_page)
1716
1717         # Download any subsequent channel pages using the json-based channel_ajax query
1718         if self._MORE_PAGES_INDICATOR in page:
1719             while True:
1720                 pagenum = pagenum + 1
1721
1722                 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1723                 page = self._download_webpage(url, channel_id,
1724                                               u'Downloading page #%s' % pagenum)
1725
1726                 page = json.loads(page)
1727
1728                 ids_in_page = self.extract_videos_from_page(page['content_html'])
1729                 video_ids.extend(ids_in_page)
1730
1731                 if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']:
1732                     break
1733
1734         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1735
1736         urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1737         url_entries = [self.url_result(url, 'Youtube') for url in urls]
1738         return [self.playlist_result(url_entries, channel_id)]
1739
1740
1741 class YoutubeUserIE(InfoExtractor):
1742     """Information Extractor for YouTube users."""
1743
1744     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1745     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1746     _GDATA_PAGE_SIZE = 50
1747     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1748     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1749     IE_NAME = u'youtube:user'
1750
1751     def _real_extract(self, url):
1752         # Extract username
1753         mobj = re.match(self._VALID_URL, url)
1754         if mobj is None:
1755             raise ExtractorError(u'Invalid URL: %s' % url)
1756
1757         username = mobj.group(1)
1758
1759         # Download video ids using YouTube Data API. Result size per
1760         # query is limited (currently to 50 videos) so we need to query
1761         # page by page until there are no video ids - it means we got
1762         # all of them.
1763
1764         video_ids = []
1765         pagenum = 0
1766
1767         while True:
1768             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1769
1770             gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1771             page = self._download_webpage(gdata_url, username,
1772                                           u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1773
1774             # Extract video identifiers
1775             ids_in_page = []
1776
1777             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1778                 if mobj.group(1) not in ids_in_page:
1779                     ids_in_page.append(mobj.group(1))
1780
1781             video_ids.extend(ids_in_page)
1782
1783             # A little optimization - if current page is not
1784             # "full", ie. does not contain PAGE_SIZE video ids then
1785             # we can assume that this page is the last one - there
1786             # are no more ids on further pages - no need to query
1787             # again.
1788
1789             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1790                 break
1791
1792             pagenum += 1
1793
1794         urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1795         url_results = [self.url_result(url, 'Youtube') for url in urls]
1796         return [self.playlist_result(url_results, playlist_title = username)]
1797
1798
1799 class BlipTVUserIE(InfoExtractor):
1800     """Information Extractor for blip.tv users."""
1801
1802     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1803     _PAGE_SIZE = 12
1804     IE_NAME = u'blip.tv:user'
1805
1806     def _real_extract(self, url):
1807         # Extract username
1808         mobj = re.match(self._VALID_URL, url)
1809         if mobj is None:
1810             raise ExtractorError(u'Invalid URL: %s' % url)
1811
1812         username = mobj.group(1)
1813
1814         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1815
1816         page = self._download_webpage(url, username, u'Downloading user page')
1817         mobj = re.search(r'data-users-id="([^"]+)"', page)
1818         page_base = page_base % mobj.group(1)
1819
1820
1821         # Download video ids using BlipTV Ajax calls. Result size per
1822         # query is limited (currently to 12 videos) so we need to query
1823         # page by page until there are no video ids - it means we got
1824         # all of them.
1825
1826         video_ids = []
1827         pagenum = 1
1828
1829         while True:
1830             url = page_base + "&page=" + str(pagenum)
1831             page = self._download_webpage(url, username,
1832                                           u'Downloading video ids from page %d' % pagenum)
1833
1834             # Extract video identifiers
1835             ids_in_page = []
1836
1837             for mobj in re.finditer(r'href="/([^"]+)"', page):
1838                 if mobj.group(1) not in ids_in_page:
1839                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1840
1841             video_ids.extend(ids_in_page)
1842
1843             # A little optimization - if current page is not
1844             # "full", ie. does not contain PAGE_SIZE video ids then
1845             # we can assume that this page is the last one - there
1846             # are no more ids on further pages - no need to query
1847             # again.
1848
1849             if len(ids_in_page) < self._PAGE_SIZE:
1850                 break
1851
1852             pagenum += 1
1853
1854         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1855         url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1856         return [self.playlist_result(url_entries, playlist_title = username)]
1857
1858
1859 class DepositFilesIE(InfoExtractor):
1860     """Information extractor for depositfiles.com"""
1861
1862     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1863
1864     def _real_extract(self, url):
1865         file_id = url.split('/')[-1]
1866         # Rebuild url in english locale
1867         url = 'http://depositfiles.com/en/files/' + file_id
1868
1869         # Retrieve file webpage with 'Free download' button pressed
1870         free_download_indication = { 'gateway_result' : '1' }
1871         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1872         try:
1873             self.report_download_webpage(file_id)
1874             webpage = compat_urllib_request.urlopen(request).read()
1875         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1876             raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1877
1878         # Search for the real file URL
1879         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1880         if (mobj is None) or (mobj.group(1) is None):
1881             # Try to figure out reason of the error.
1882             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1883             if (mobj is not None) and (mobj.group(1) is not None):
1884                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1885                 raise ExtractorError(u'%s' % restriction_message)
1886             else:
1887                 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1888
1889         file_url = mobj.group(1)
1890         file_extension = os.path.splitext(file_url)[1][1:]
1891
1892         # Search for file title
1893         file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
1894
1895         return [{
1896             'id':       file_id.decode('utf-8'),
1897             'url':      file_url.decode('utf-8'),
1898             'uploader': None,
1899             'upload_date':  None,
1900             'title':    file_title,
1901             'ext':      file_extension.decode('utf-8'),
1902         }]
1903
1904
1905 class FacebookIE(InfoExtractor):
1906     """Information Extractor for Facebook"""
1907
1908     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1909     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1910     _NETRC_MACHINE = 'facebook'
1911     IE_NAME = u'facebook'
1912
1913     def report_login(self):
1914         """Report attempt to log in."""
1915         self.to_screen(u'Logging in')
1916
1917     def _real_initialize(self):
1918         if self._downloader is None:
1919             return
1920
1921         useremail = None
1922         password = None
1923         downloader_params = self._downloader.params
1924
1925         # Attempt to use provided username and password or .netrc data
1926         if downloader_params.get('username', None) is not None:
1927             useremail = downloader_params['username']
1928             password = downloader_params['password']
1929         elif downloader_params.get('usenetrc', False):
1930             try:
1931                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1932                 if info is not None:
1933                     useremail = info[0]
1934                     password = info[2]
1935                 else:
1936                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1937             except (IOError, netrc.NetrcParseError) as err:
1938                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1939                 return
1940
1941         if useremail is None:
1942             return
1943
1944         # Log in
1945         login_form = {
1946             'email': useremail,
1947             'pass': password,
1948             'login': 'Log+In'
1949             }
1950         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1951         try:
1952             self.report_login()
1953             login_results = compat_urllib_request.urlopen(request).read()
1954             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1955                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1956                 return
1957         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1958             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1959             return
1960
1961     def _real_extract(self, url):
1962         mobj = re.match(self._VALID_URL, url)
1963         if mobj is None:
1964             raise ExtractorError(u'Invalid URL: %s' % url)
1965         video_id = mobj.group('ID')
1966
1967         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1968         webpage = self._download_webpage(url, video_id)
1969
1970         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1971         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1972         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1973         if not m:
1974             raise ExtractorError(u'Cannot parse data')
1975         data = dict(json.loads(m.group(1)))
1976         params_raw = compat_urllib_parse.unquote(data['params'])
1977         params = json.loads(params_raw)
1978         video_data = params['video_data'][0]
1979         video_url = video_data.get('hd_src')
1980         if not video_url:
1981             video_url = video_data['sd_src']
1982         if not video_url:
1983             raise ExtractorError(u'Cannot find video URL')
1984         video_duration = int(video_data['video_duration'])
1985         thumbnail = video_data['thumbnail_src']
1986
1987         video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1988             webpage, u'title')
1989
1990         info = {
1991             'id': video_id,
1992             'title': video_title,
1993             'url': video_url,
1994             'ext': 'mp4',
1995             'duration': video_duration,
1996             'thumbnail': thumbnail,
1997         }
1998         return [info]
1999
2000
2001 class BlipTVIE(InfoExtractor):
2002     """Information extractor for blip.tv"""
2003
2004     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
2005     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2006     IE_NAME = u'blip.tv'
2007
2008     def report_direct_download(self, title):
2009         """Report information extraction."""
2010         self.to_screen(u'%s: Direct download detected' % title)
2011
2012     def _real_extract(self, url):
2013         mobj = re.match(self._VALID_URL, url)
2014         if mobj is None:
2015             raise ExtractorError(u'Invalid URL: %s' % url)
2016
2017         # See https://github.com/rg3/youtube-dl/issues/857
2018         api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
2019         if api_mobj is not None:
2020             url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
2021         urlp = compat_urllib_parse_urlparse(url)
2022         if urlp.path.startswith('/play/'):
2023             request = compat_urllib_request.Request(url)
2024             response = compat_urllib_request.urlopen(request)
2025             redirecturl = response.geturl()
2026             rurlp = compat_urllib_parse_urlparse(redirecturl)
2027             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2028             url = 'http://blip.tv/a/a-' + file_id
2029             return self._real_extract(url)
2030
2031
2032         if '?' in url:
2033             cchar = '&'
2034         else:
2035             cchar = '?'
2036         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2037         request = compat_urllib_request.Request(json_url)
2038         request.add_header('User-Agent', 'iTunes/10.6.1')
2039         self.report_extraction(mobj.group(1))
2040         info = None
2041         try:
2042             urlh = compat_urllib_request.urlopen(request)
2043             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2044                 basename = url.split('/')[-1]
2045                 title,ext = os.path.splitext(basename)
2046                 title = title.decode('UTF-8')
2047                 ext = ext.replace('.', '')
2048                 self.report_direct_download(title)
2049                 info = {
2050                     'id': title,
2051                     'url': url,
2052                     'uploader': None,
2053                     'upload_date': None,
2054                     'title': title,
2055                     'ext': ext,
2056                     'urlhandle': urlh
2057                 }
2058         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2059             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2060         if info is None: # Regular URL
2061             try:
2062                 json_code_bytes = urlh.read()
2063                 json_code = json_code_bytes.decode('utf-8')
2064             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2065                 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2066
2067             try:
2068                 json_data = json.loads(json_code)
2069                 if 'Post' in json_data:
2070                     data = json_data['Post']
2071                 else:
2072                     data = json_data
2073
2074                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2075                 video_url = data['media']['url']
2076                 umobj = re.match(self._URL_EXT, video_url)
2077                 if umobj is None:
2078                     raise ValueError('Can not determine filename extension')
2079                 ext = umobj.group(1)
2080
2081                 info = {
2082                     'id': data['item_id'],
2083                     'url': video_url,
2084                     'uploader': data['display_name'],
2085                     'upload_date': upload_date,
2086                     'title': data['title'],
2087                     'ext': ext,
2088                     'format': data['media']['mimeType'],
2089                     'thumbnail': data['thumbnailUrl'],
2090                     'description': data['description'],
2091                     'player_url': data['embedUrl'],
2092                     'user_agent': 'iTunes/10.6.1',
2093                 }
2094             except (ValueError,KeyError) as err:
2095                 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
2096
2097         return [info]
2098
2099
2100 class MyVideoIE(InfoExtractor):
2101     """Information Extractor for myvideo.de."""
2102
2103     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2104     IE_NAME = u'myvideo'
2105
2106     # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2107     # Released into the Public Domain by Tristan Fischer on 2013-05-19
2108     # https://github.com/rg3/youtube-dl/pull/842
2109     def __rc4crypt(self,data, key):
2110         x = 0
2111         box = list(range(256))
2112         for i in list(range(256)):
2113             x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2114             box[i], box[x] = box[x], box[i]
2115         x = 0
2116         y = 0
2117         out = ''
2118         for char in data:
2119             x = (x + 1) % 256
2120             y = (y + box[x]) % 256
2121             box[x], box[y] = box[y], box[x]
2122             out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
2123         return out
2124
2125     def __md5(self,s):
2126         return hashlib.md5(s).hexdigest().encode()
2127
2128     def _real_extract(self,url):
2129         mobj = re.match(self._VALID_URL, url)
2130         if mobj is None:
2131             raise ExtractorError(u'invalid URL: %s' % url)
2132
2133         video_id = mobj.group(1)
2134
2135         GK = (
2136           b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2137           b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2138           b'TnpsbA0KTVRkbU1tSTRNdz09'
2139         )
2140
2141         # Get video webpage
2142         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2143         webpage = self._download_webpage(webpage_url, video_id)
2144
2145         mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2146         if mobj is not None:
2147             self.report_extraction(video_id)
2148             video_url = mobj.group(1) + '.flv'
2149
2150             video_title = self._html_search_regex('<title>([^<]+)</title>',
2151                 webpage, u'title')
2152
2153             video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2154
2155             return [{
2156                 'id':       video_id,
2157                 'url':      video_url,
2158                 'uploader': None,
2159                 'upload_date':  None,
2160                 'title':    video_title,
2161                 'ext':      u'flv',
2162             }]
2163
2164         # try encxml
2165         mobj = re.search('var flashvars={(.+?)}', webpage)
2166         if mobj is None:
2167             raise ExtractorError(u'Unable to extract video')
2168
2169         params = {}
2170         encxml = ''
2171         sec = mobj.group(1)
2172         for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2173             if not a == '_encxml':
2174                 params[a] = b
2175             else:
2176                 encxml = compat_urllib_parse.unquote(b)
2177         if not params.get('domain'):
2178             params['domain'] = 'www.myvideo.de'
2179         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2180         if 'flash_playertype=MTV' in xmldata_url:
2181             self._downloader.report_warning(u'avoiding MTV player')
2182             xmldata_url = (
2183                 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2184                 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
2185             ) % video_id
2186
2187         # get enc data
2188         enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2189         enc_data_b = binascii.unhexlify(enc_data)
2190         sk = self.__md5(
2191             base64.b64decode(base64.b64decode(GK)) +
2192             self.__md5(
2193                 str(video_id).encode('utf-8')
2194             )
2195         )
2196         dec_data = self.__rc4crypt(enc_data_b, sk)
2197
2198         # extracting infos
2199         self.report_extraction(video_id)
2200
2201         video_url = None
2202         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2203         if mobj:
2204             video_url = compat_urllib_parse.unquote(mobj.group(1))
2205             if 'myvideo2flash' in video_url:
2206                 self._downloader.report_warning(u'forcing RTMPT ...')
2207                 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2208
2209         if not video_url:
2210             # extract non rtmp videos
2211             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2212             if mobj is None:
2213                 raise ExtractorError(u'unable to extract url')
2214             video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2215
2216         video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2217         video_file = compat_urllib_parse.unquote(video_file)
2218
2219         if not video_file.endswith('f4m'):
2220             ppath, prefix = video_file.split('.')
2221             video_playpath = '%s:%s' % (prefix, ppath)
2222             video_hls_playlist = ''
2223         else:
2224             video_playpath = ''
2225             video_hls_playlist = (
2226                 video_filepath + video_file
2227             ).replace('.f4m', '.m3u8')
2228
2229         video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2230         video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2231
2232         video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2233             webpage, u'title')
2234
2235         return [{
2236             'id':                 video_id,
2237             'url':                video_url,
2238             'tc_url':             video_url,
2239             'uploader':           None,
2240             'upload_date':        None,
2241             'title':              video_title,
2242             'ext':                u'flv',
2243             'play_path':          video_playpath,
2244             'video_file':         video_file,
2245             'video_hls_playlist': video_hls_playlist,
2246             'player_url':         video_swfobj,
2247         }]
2248
2249
2250 class ComedyCentralIE(InfoExtractor):
2251     """Information extractor for The Daily Show and Colbert Report """
2252
2253     # urls can be abbreviations like :thedailyshow or :colbert
2254     # urls for episodes like:
2255     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2256     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2257     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2258     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2259                       |(https?://)?(www\.)?
2260                           (?P<showname>thedailyshow|colbertnation)\.com/
2261                          (full-episodes/(?P<episode>.*)|
2262                           (?P<clip>
2263                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2264                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2265                      $"""
2266
2267     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2268
2269     _video_extensions = {
2270         '3500': 'mp4',
2271         '2200': 'mp4',
2272         '1700': 'mp4',
2273         '1200': 'mp4',
2274         '750': 'mp4',
2275         '400': 'mp4',
2276     }
2277     _video_dimensions = {
2278         '3500': '1280x720',
2279         '2200': '960x540',
2280         '1700': '768x432',
2281         '1200': '640x360',
2282         '750': '512x288',
2283         '400': '384x216',
2284     }
2285
2286     @classmethod
2287     def suitable(cls, url):
2288         """Receives a URL and returns True if suitable for this IE."""
2289         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2290
2291     def _print_formats(self, formats):
2292         print('Available formats:')
2293         for x in formats:
2294             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2295
2296
2297     def _real_extract(self, url):
2298         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2299         if mobj is None:
2300             raise ExtractorError(u'Invalid URL: %s' % url)
2301
2302         if mobj.group('shortname'):
2303             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2304                 url = u'http://www.thedailyshow.com/full-episodes/'
2305             else:
2306                 url = u'http://www.colbertnation.com/full-episodes/'
2307             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2308             assert mobj is not None
2309
2310         if mobj.group('clip'):
2311             if mobj.group('showname') == 'thedailyshow':
2312                 epTitle = mobj.group('tdstitle')
2313             else:
2314                 epTitle = mobj.group('cntitle')
2315             dlNewest = False
2316         else:
2317             dlNewest = not mobj.group('episode')
2318             if dlNewest:
2319                 epTitle = mobj.group('showname')
2320             else:
2321                 epTitle = mobj.group('episode')
2322
2323         self.report_extraction(epTitle)
2324         webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2325         if dlNewest:
2326             url = htmlHandle.geturl()
2327             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2328             if mobj is None:
2329                 raise ExtractorError(u'Invalid redirected URL: ' + url)
2330             if mobj.group('episode') == '':
2331                 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2332             epTitle = mobj.group('episode')
2333
2334         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2335
2336         if len(mMovieParams) == 0:
2337             # The Colbert Report embeds the information in a without
2338             # a URL prefix; so extract the alternate reference
2339             # and then add the URL prefix manually.
2340
2341             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2342             if len(altMovieParams) == 0:
2343                 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2344             else:
2345                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2346
2347         uri = mMovieParams[0][1]
2348         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2349         indexXml = self._download_webpage(indexUrl, epTitle,
2350                                           u'Downloading show index',
2351                                           u'unable to download episode index')
2352
2353         results = []
2354
2355         idoc = xml.etree.ElementTree.fromstring(indexXml)
2356         itemEls = idoc.findall('.//item')
2357         for partNum,itemEl in enumerate(itemEls):
2358             mediaId = itemEl.findall('./guid')[0].text
2359             shortMediaId = mediaId.split(':')[-1]
2360             showId = mediaId.split(':')[-2].replace('.com', '')
2361             officialTitle = itemEl.findall('./title')[0].text
2362             officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2363
2364             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2365                         compat_urllib_parse.urlencode({'uri': mediaId}))
2366             configXml = self._download_webpage(configUrl, epTitle,
2367                                                u'Downloading configuration for %s' % shortMediaId)
2368
2369             cdoc = xml.etree.ElementTree.fromstring(configXml)
2370             turls = []
2371             for rendition in cdoc.findall('.//rendition'):
2372                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2373                 turls.append(finfo)
2374
2375             if len(turls) == 0:
2376                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2377                 continue
2378
2379             if self._downloader.params.get('listformats', None):
2380                 self._print_formats([i[0] for i in turls])
2381                 return
2382
2383             # For now, just pick the highest bitrate
2384             format,rtmp_video_url = turls[-1]
2385
2386             # Get the format arg from the arg stream
2387             req_format = self._downloader.params.get('format', None)
2388
2389             # Select format if we can find one
2390             for f,v in turls:
2391                 if f == req_format:
2392                     format, rtmp_video_url = f, v
2393                     break
2394
2395             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2396             if not m:
2397                 raise ExtractorError(u'Cannot transform RTMP url')
2398             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2399             video_url = base + m.group('finalid')
2400
2401             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2402             info = {
2403                 'id': shortMediaId,
2404                 'url': video_url,
2405                 'uploader': showId,
2406                 'upload_date': officialDate,
2407                 'title': effTitle,
2408                 'ext': 'mp4',
2409                 'format': format,
2410                 'thumbnail': None,
2411                 'description': officialTitle,
2412             }
2413             results.append(info)
2414
2415         return results
2416
2417
2418 class EscapistIE(InfoExtractor):
2419     """Information extractor for The Escapist """
2420
2421     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2422     IE_NAME = u'escapist'
2423
2424     def _real_extract(self, url):
2425         mobj = re.match(self._VALID_URL, url)
2426         if mobj is None:
2427             raise ExtractorError(u'Invalid URL: %s' % url)
2428         showName = mobj.group('showname')
2429         videoId = mobj.group('episode')
2430
2431         self.report_extraction(videoId)
2432         webpage = self._download_webpage(url, videoId)
2433
2434         videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2435             webpage, u'description', fatal=False)
2436
2437         imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2438             webpage, u'thumbnail', fatal=False)
2439
2440         playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2441             webpage, u'player url')
2442
2443         title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2444             webpage, u'player url').split(' : ')[-1]
2445
2446         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2447         configUrl = compat_urllib_parse.unquote(configUrl)
2448
2449         configJSON = self._download_webpage(configUrl, videoId,
2450                                             u'Downloading configuration',
2451                                             u'unable to download configuration')
2452
2453         # Technically, it's JavaScript, not JSON
2454         configJSON = configJSON.replace("'", '"')
2455
2456         try:
2457             config = json.loads(configJSON)
2458         except (ValueError,) as err:
2459             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2460
2461         playlist = config['playlist']
2462         videoUrl = playlist[1]['url']
2463
2464         info = {
2465             'id': videoId,
2466             'url': videoUrl,
2467             'uploader': showName,
2468             'upload_date': None,
2469             'title': title,
2470             'ext': 'mp4',
2471             'thumbnail': imgUrl,
2472             'description': videoDesc,
2473             'player_url': playerUrl,
2474         }
2475
2476         return [info]
2477
2478 class CollegeHumorIE(InfoExtractor):
2479     """Information extractor for collegehumor.com"""
2480
2481     _WORKING = False
2482     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2483     IE_NAME = u'collegehumor'
2484
2485     def report_manifest(self, video_id):
2486         """Report information extraction."""
2487         self.to_screen(u'%s: Downloading XML manifest' % video_id)
2488
2489     def _real_extract(self, url):
2490         mobj = re.match(self._VALID_URL, url)
2491         if mobj is None:
2492             raise ExtractorError(u'Invalid URL: %s' % url)
2493         video_id = mobj.group('videoid')
2494
2495         info = {
2496             'id': video_id,
2497             'uploader': None,
2498             'upload_date': None,
2499         }
2500
2501         self.report_extraction(video_id)
2502         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2503         try:
2504             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2505         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2506             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2507
2508         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2509         try:
2510             videoNode = mdoc.findall('./video')[0]
2511             info['description'] = videoNode.findall('./description')[0].text
2512             info['title'] = videoNode.findall('./caption')[0].text
2513             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2514             manifest_url = videoNode.findall('./file')[0].text
2515         except IndexError:
2516             raise ExtractorError(u'Invalid metadata XML file')
2517
2518         manifest_url += '?hdcore=2.10.3'
2519         self.report_manifest(video_id)
2520         try:
2521             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2522         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2523             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2524
2525         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2526         try:
2527             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2528             node_id = media_node.attrib['url']
2529             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2530         except IndexError as err:
2531             raise ExtractorError(u'Invalid manifest file')
2532
2533         url_pr = compat_urllib_parse_urlparse(manifest_url)
2534         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2535
2536         info['url'] = url
2537         info['ext'] = 'f4f'
2538         return [info]
2539
2540
2541 class XVideosIE(InfoExtractor):
2542     """Information extractor for xvideos.com"""
2543
2544     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2545     IE_NAME = u'xvideos'
2546
2547     def _real_extract(self, url):
2548         mobj = re.match(self._VALID_URL, url)
2549         if mobj is None:
2550             raise ExtractorError(u'Invalid URL: %s' % url)
2551         video_id = mobj.group(1)
2552
2553         webpage = self._download_webpage(url, video_id)
2554
2555         self.report_extraction(video_id)
2556
2557         # Extract video URL
2558         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2559             webpage, u'video URL'))
2560
2561         # Extract title
2562         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2563             webpage, u'title')
2564
2565         # Extract video thumbnail
2566         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2567             webpage, u'thumbnail', fatal=False)
2568
2569         info = {
2570             'id': video_id,
2571             'url': video_url,
2572             'uploader': None,
2573             'upload_date': None,
2574             'title': video_title,
2575             'ext': 'flv',
2576             'thumbnail': video_thumbnail,
2577             'description': None,
2578         }
2579
2580         return [info]
2581
2582
2583 class SoundcloudIE(InfoExtractor):
2584     """Information extractor for soundcloud.com
2585        To access the media, the uid of the song and a stream token
2586        must be extracted from the page source and the script must make
2587        a request to media.soundcloud.com/crossdomain.xml. Then
2588        the media can be grabbed by requesting from an url composed
2589        of the stream token and uid
2590      """
2591
2592     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2593     IE_NAME = u'soundcloud'
2594
2595     def report_resolve(self, video_id):
2596         """Report information extraction."""
2597         self.to_screen(u'%s: Resolving id' % video_id)
2598
2599     def _real_extract(self, url):
2600         mobj = re.match(self._VALID_URL, url)
2601         if mobj is None:
2602             raise ExtractorError(u'Invalid URL: %s' % url)
2603
2604         # extract uploader (which is in the url)
2605         uploader = mobj.group(1)
2606         # extract simple title (uploader + slug of song title)
2607         slug_title =  mobj.group(2)
2608         simple_title = uploader + u'-' + slug_title
2609         full_title = '%s/%s' % (uploader, slug_title)
2610
2611         self.report_resolve(full_title)
2612
2613         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2614         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2615         info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2616
2617         info = json.loads(info_json)
2618         video_id = info['id']
2619         self.report_extraction(full_title)
2620
2621         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2622         stream_json = self._download_webpage(streams_url, full_title,
2623                                              u'Downloading stream definitions',
2624                                              u'unable to download stream definitions')
2625
2626         streams = json.loads(stream_json)
2627         mediaURL = streams['http_mp3_128_url']
2628         upload_date = unified_strdate(info['created_at'])
2629
2630         return [{
2631             'id':       info['id'],
2632             'url':      mediaURL,
2633             'uploader': info['user']['username'],
2634             'upload_date': upload_date,
2635             'title':    info['title'],
2636             'ext':      u'mp3',
2637             'description': info['description'],
2638         }]
2639
2640 class SoundcloudSetIE(InfoExtractor):
2641     """Information extractor for soundcloud.com sets
2642        To access the media, the uid of the song and a stream token
2643        must be extracted from the page source and the script must make
2644        a request to media.soundcloud.com/crossdomain.xml. Then
2645        the media can be grabbed by requesting from an url composed
2646        of the stream token and uid
2647      """
2648
2649     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2650     IE_NAME = u'soundcloud:set'
2651
2652     def report_resolve(self, video_id):
2653         """Report information extraction."""
2654         self.to_screen(u'%s: Resolving id' % video_id)
2655
2656     def _real_extract(self, url):
2657         mobj = re.match(self._VALID_URL, url)
2658         if mobj is None:
2659             raise ExtractorError(u'Invalid URL: %s' % url)
2660
2661         # extract uploader (which is in the url)
2662         uploader = mobj.group(1)
2663         # extract simple title (uploader + slug of song title)
2664         slug_title =  mobj.group(2)
2665         simple_title = uploader + u'-' + slug_title
2666         full_title = '%s/sets/%s' % (uploader, slug_title)
2667
2668         self.report_resolve(full_title)
2669
2670         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2671         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2672         info_json = self._download_webpage(resolv_url, full_title)
2673
2674         videos = []
2675         info = json.loads(info_json)
2676         if 'errors' in info:
2677             for err in info['errors']:
2678                 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2679             return
2680
2681         self.report_extraction(full_title)
2682         for track in info['tracks']:
2683             video_id = track['id']
2684
2685             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2686             stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2687
2688             self.report_extraction(video_id)
2689             streams = json.loads(stream_json)
2690             mediaURL = streams['http_mp3_128_url']
2691
2692             videos.append({
2693                 'id':       video_id,
2694                 'url':      mediaURL,
2695                 'uploader': track['user']['username'],
2696                 'upload_date':  unified_strdate(track['created_at']),
2697                 'title':    track['title'],
2698                 'ext':      u'mp3',
2699                 'description': track['description'],
2700             })
2701         return videos
2702
2703
2704 class InfoQIE(InfoExtractor):
2705     """Information extractor for infoq.com"""
2706     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2707
2708     def _real_extract(self, url):
2709         mobj = re.match(self._VALID_URL, url)
2710         if mobj is None:
2711             raise ExtractorError(u'Invalid URL: %s' % url)
2712
2713         webpage = self._download_webpage(url, video_id=url)
2714         self.report_extraction(url)
2715
2716         # Extract video URL
2717         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2718         if mobj is None:
2719             raise ExtractorError(u'Unable to extract video url')
2720         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2721         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2722
2723         # Extract title
2724         video_title = self._search_regex(r'contentTitle = "(.*?)";',
2725             webpage, u'title')
2726
2727         # Extract description
2728         video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2729             webpage, u'description', fatal=False)
2730
2731         video_filename = video_url.split('/')[-1]
2732         video_id, extension = video_filename.split('.')
2733
2734         info = {
2735             'id': video_id,
2736             'url': video_url,
2737             'uploader': None,
2738             'upload_date': None,
2739             'title': video_title,
2740             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2741             'thumbnail': None,
2742             'description': video_description,
2743         }
2744
2745         return [info]
2746
2747 class MixcloudIE(InfoExtractor):
2748     """Information extractor for www.mixcloud.com"""
2749
2750     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2751     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2752     IE_NAME = u'mixcloud'
2753
2754     def report_download_json(self, file_id):
2755         """Report JSON download."""
2756         self.to_screen(u'Downloading json')
2757
2758     def get_urls(self, jsonData, fmt, bitrate='best'):
2759         """Get urls from 'audio_formats' section in json"""
2760         file_url = None
2761         try:
2762             bitrate_list = jsonData[fmt]
2763             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2764                 bitrate = max(bitrate_list) # select highest
2765
2766             url_list = jsonData[fmt][bitrate]
2767         except TypeError: # we have no bitrate info.
2768             url_list = jsonData[fmt]
2769         return url_list
2770
2771     def check_urls(self, url_list):
2772         """Returns 1st active url from list"""
2773         for url in url_list:
2774             try:
2775                 compat_urllib_request.urlopen(url)
2776                 return url
2777             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2778                 url = None
2779
2780         return None
2781
2782     def _print_formats(self, formats):
2783         print('Available formats:')
2784         for fmt in formats.keys():
2785             for b in formats[fmt]:
2786                 try:
2787                     ext = formats[fmt][b][0]
2788                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2789                 except TypeError: # we have no bitrate info
2790                     ext = formats[fmt][0]
2791                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2792                     break
2793
2794     def _real_extract(self, url):
2795         mobj = re.match(self._VALID_URL, url)
2796         if mobj is None:
2797             raise ExtractorError(u'Invalid URL: %s' % url)
2798         # extract uploader & filename from url
2799         uploader = mobj.group(1).decode('utf-8')
2800         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2801
2802         # construct API request
2803         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2804         # retrieve .json file with links to files
2805         request = compat_urllib_request.Request(file_url)
2806         try:
2807             self.report_download_json(file_url)
2808             jsonData = compat_urllib_request.urlopen(request).read()
2809         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2810             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2811
2812         # parse JSON
2813         json_data = json.loads(jsonData)
2814         player_url = json_data['player_swf_url']
2815         formats = dict(json_data['audio_formats'])
2816
2817         req_format = self._downloader.params.get('format', None)
2818         bitrate = None
2819
2820         if self._downloader.params.get('listformats', None):
2821             self._print_formats(formats)
2822             return
2823
2824         if req_format is None or req_format == 'best':
2825             for format_param in formats.keys():
2826                 url_list = self.get_urls(formats, format_param)
2827                 # check urls
2828                 file_url = self.check_urls(url_list)
2829                 if file_url is not None:
2830                     break # got it!
2831         else:
2832             if req_format not in formats:
2833                 raise ExtractorError(u'Format is not available')
2834
2835             url_list = self.get_urls(formats, req_format)
2836             file_url = self.check_urls(url_list)
2837             format_param = req_format
2838
2839         return [{
2840             'id': file_id.decode('utf-8'),
2841             'url': file_url.decode('utf-8'),
2842             'uploader': uploader.decode('utf-8'),
2843             'upload_date': None,
2844             'title': json_data['name'],
2845             'ext': file_url.split('.')[-1].decode('utf-8'),
2846             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2847             'thumbnail': json_data['thumbnail_url'],
2848             'description': json_data['description'],
2849             'player_url': player_url.decode('utf-8'),
2850         }]
2851
2852 class StanfordOpenClassroomIE(InfoExtractor):
2853     """Information extractor for Stanford's Open ClassRoom"""
2854
2855     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2856     IE_NAME = u'stanfordoc'
2857
2858     def _real_extract(self, url):
2859         mobj = re.match(self._VALID_URL, url)
2860         if mobj is None:
2861             raise ExtractorError(u'Invalid URL: %s' % url)
2862
2863         if mobj.group('course') and mobj.group('video'): # A specific video
2864             course = mobj.group('course')
2865             video = mobj.group('video')
2866             info = {
2867                 'id': course + '_' + video,
2868                 'uploader': None,
2869                 'upload_date': None,
2870             }
2871
2872             self.report_extraction(info['id'])
2873             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2874             xmlUrl = baseUrl + video + '.xml'
2875             try:
2876                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2877             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2878                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2879             mdoc = xml.etree.ElementTree.fromstring(metaXml)
2880             try:
2881                 info['title'] = mdoc.findall('./title')[0].text
2882                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2883             except IndexError:
2884                 raise ExtractorError(u'Invalid metadata XML file')
2885             info['ext'] = info['url'].rpartition('.')[2]
2886             return [info]
2887         elif mobj.group('course'): # A course page
2888             course = mobj.group('course')
2889             info = {
2890                 'id': course,
2891                 'type': 'playlist',
2892                 'uploader': None,
2893                 'upload_date': None,
2894             }
2895
2896             coursepage = self._download_webpage(url, info['id'],
2897                                         note='Downloading course info page',
2898                                         errnote='Unable to download course info page')
2899
2900             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2901
2902             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2903                 coursepage, u'description', fatal=False)
2904
2905             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2906             info['list'] = [
2907                 {
2908                     'type': 'reference',
2909                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2910                 }
2911                     for vpage in links]
2912             results = []
2913             for entry in info['list']:
2914                 assert entry['type'] == 'reference'
2915                 results += self.extract(entry['url'])
2916             return results
2917         else: # Root page
2918             info = {
2919                 'id': 'Stanford OpenClassroom',
2920                 'type': 'playlist',
2921                 'uploader': None,
2922                 'upload_date': None,
2923             }
2924
2925             self.report_download_webpage(info['id'])
2926             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2927             try:
2928                 rootpage = compat_urllib_request.urlopen(rootURL).read()
2929             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2930                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2931
2932             info['title'] = info['id']
2933
2934             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2935             info['list'] = [
2936                 {
2937                     'type': 'reference',
2938                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2939                 }
2940                     for cpage in links]
2941
2942             results = []
2943             for entry in info['list']:
2944                 assert entry['type'] == 'reference'
2945                 results += self.extract(entry['url'])
2946             return results
2947
2948 class MTVIE(InfoExtractor):
2949     """Information extractor for MTV.com"""
2950
2951     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2952     IE_NAME = u'mtv'
2953
2954     def _real_extract(self, url):
2955         mobj = re.match(self._VALID_URL, url)
2956         if mobj is None:
2957             raise ExtractorError(u'Invalid URL: %s' % url)
2958         if not mobj.group('proto'):
2959             url = 'http://' + url
2960         video_id = mobj.group('videoid')
2961
2962         webpage = self._download_webpage(url, video_id)
2963
2964         song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2965             webpage, u'song name', fatal=False)
2966
2967         video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2968             webpage, u'title')
2969
2970         mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2971             webpage, u'mtvn_uri', fatal=False)
2972
2973         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2974             webpage, u'content id', fatal=False)
2975
2976         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2977         self.report_extraction(video_id)
2978         request = compat_urllib_request.Request(videogen_url)
2979         try:
2980             metadataXml = compat_urllib_request.urlopen(request).read()
2981         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2982             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2983
2984         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2985         renditions = mdoc.findall('.//rendition')
2986
2987         # For now, always pick the highest quality.
2988         rendition = renditions[-1]
2989
2990         try:
2991             _,_,ext = rendition.attrib['type'].partition('/')
2992             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2993             video_url = rendition.find('./src').text
2994         except KeyError:
2995             raise ExtractorError('Invalid rendition field.')
2996
2997         info = {
2998             'id': video_id,
2999             'url': video_url,
3000             'uploader': performer,
3001             'upload_date': None,
3002             'title': video_title,
3003             'ext': ext,
3004             'format': format,
3005         }
3006
3007         return [info]
3008
3009
3010 class YoukuIE(InfoExtractor):
3011     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3012
3013     def _gen_sid(self):
3014         nowTime = int(time.time() * 1000)
3015         random1 = random.randint(1000,1998)
3016         random2 = random.randint(1000,9999)
3017
3018         return "%d%d%d" %(nowTime,random1,random2)
3019
3020     def _get_file_ID_mix_string(self, seed):
3021         mixed = []
3022         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3023         seed = float(seed)
3024         for i in range(len(source)):
3025             seed  =  (seed * 211 + 30031 ) % 65536
3026             index  =  math.floor(seed / 65536 * len(source) )
3027             mixed.append(source[int(index)])
3028             source.remove(source[int(index)])
3029         #return ''.join(mixed)
3030         return mixed
3031
3032     def _get_file_id(self, fileId, seed):
3033         mixed = self._get_file_ID_mix_string(seed)
3034         ids = fileId.split('*')
3035         realId = []
3036         for ch in ids:
3037             if ch:
3038                 realId.append(mixed[int(ch)])
3039         return ''.join(realId)
3040
3041     def _real_extract(self, url):
3042         mobj = re.match(self._VALID_URL, url)
3043         if mobj is None:
3044             raise ExtractorError(u'Invalid URL: %s' % url)
3045         video_id = mobj.group('ID')
3046
3047         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3048
3049         jsondata = self._download_webpage(info_url, video_id)
3050
3051         self.report_extraction(video_id)
3052         try:
3053             config = json.loads(jsondata)
3054
3055             video_title =  config['data'][0]['title']
3056             seed = config['data'][0]['seed']
3057
3058             format = self._downloader.params.get('format', None)
3059             supported_format = list(config['data'][0]['streamfileids'].keys())
3060
3061             if format is None or format == 'best':
3062                 if 'hd2' in supported_format:
3063                     format = 'hd2'
3064                 else:
3065                     format = 'flv'
3066                 ext = u'flv'
3067             elif format == 'worst':
3068                 format = 'mp4'
3069                 ext = u'mp4'
3070             else:
3071                 format = 'flv'
3072                 ext = u'flv'
3073
3074
3075             fileid = config['data'][0]['streamfileids'][format]
3076             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3077         except (UnicodeDecodeError, ValueError, KeyError):
3078             raise ExtractorError(u'Unable to extract info section')
3079
3080         files_info=[]
3081         sid = self._gen_sid()
3082         fileid = self._get_file_id(fileid, seed)
3083
3084         #column 8,9 of fileid represent the segment number
3085         #fileid[7:9] should be changed
3086         for index, key in enumerate(keys):
3087
3088             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3089             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3090
3091             info = {
3092                 'id': '%s_part%02d' % (video_id, index),
3093                 'url': download_url,
3094                 'uploader': None,
3095                 'upload_date': None,
3096                 'title': video_title,
3097                 'ext': ext,
3098             }
3099             files_info.append(info)
3100
3101         return files_info
3102
3103
3104 class XNXXIE(InfoExtractor):
3105     """Information extractor for xnxx.com"""
3106
3107     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3108     IE_NAME = u'xnxx'
3109     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3110     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3111     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3112
3113     def _real_extract(self, url):
3114         mobj = re.match(self._VALID_URL, url)
3115         if mobj is None:
3116             raise ExtractorError(u'Invalid URL: %s' % url)
3117         video_id = mobj.group(1)
3118
3119         # Get webpage content
3120         webpage = self._download_webpage(url, video_id)
3121
3122         video_url = self._search_regex(self.VIDEO_URL_RE,
3123             webpage, u'video URL')
3124         video_url = compat_urllib_parse.unquote(video_url)
3125
3126         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
3127             webpage, u'title')
3128
3129         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3130             webpage, u'thumbnail', fatal=False)
3131
3132         return [{
3133             'id': video_id,
3134             'url': video_url,
3135             'uploader': None,
3136             'upload_date': None,
3137             'title': video_title,
3138             'ext': 'flv',
3139             'thumbnail': video_thumbnail,
3140             'description': None,
3141         }]
3142
3143
3144 class GooglePlusIE(InfoExtractor):
3145     """Information extractor for plus.google.com."""
3146
3147     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3148     IE_NAME = u'plus.google'
3149
3150     def _real_extract(self, url):
3151         # Extract id from URL
3152         mobj = re.match(self._VALID_URL, url)
3153         if mobj is None:
3154             raise ExtractorError(u'Invalid URL: %s' % url)
3155
3156         post_url = mobj.group(0)
3157         video_id = mobj.group(1)
3158
3159         video_extension = 'flv'
3160
3161         # Step 1, Retrieve post webpage to extract further information
3162         webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3163
3164         self.report_extraction(video_id)
3165
3166         # Extract update date
3167         upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3168             webpage, u'upload date', fatal=False)
3169         if upload_date:
3170             # Convert timestring to a format suitable for filename
3171             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3172             upload_date = upload_date.strftime('%Y%m%d')
3173
3174         # Extract uploader
3175         uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3176             webpage, u'uploader', fatal=False)
3177
3178         # Extract title
3179         # Get the first line for title
3180         video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3181             webpage, 'title', default=u'NA')
3182
3183         # Step 2, Stimulate clicking the image box to launch video
3184         video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3185             webpage, u'video page URL')
3186         webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3187
3188         # Extract video links on video page
3189         """Extract video links of all sizes"""
3190         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3191         mobj = re.findall(pattern, webpage)
3192         if len(mobj) == 0:
3193             raise ExtractorError(u'Unable to extract video links')
3194
3195         # Sort in resolution
3196         links = sorted(mobj)
3197
3198         # Choose the lowest of the sort, i.e. highest resolution
3199         video_url = links[-1]
3200         # Only get the url. The resolution part in the tuple has no use anymore
3201         video_url = video_url[-1]
3202         # Treat escaped \u0026 style hex
3203         try:
3204             video_url = video_url.decode("unicode_escape")
3205         except AttributeError: # Python 3
3206             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3207
3208
3209         return [{
3210             'id':       video_id,
3211             'url':      video_url,
3212             'uploader': uploader,
3213             'upload_date':  upload_date,
3214             'title':    video_title,
3215             'ext':      video_extension,
3216         }]
3217
3218 class NBAIE(InfoExtractor):
3219     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3220     IE_NAME = u'nba'
3221
3222     def _real_extract(self, url):
3223         mobj = re.match(self._VALID_URL, url)
3224         if mobj is None:
3225             raise ExtractorError(u'Invalid URL: %s' % url)
3226
3227         video_id = mobj.group(1)
3228
3229         webpage = self._download_webpage(url, video_id)
3230
3231         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3232
3233         shortened_video_id = video_id.rpartition('/')[2]
3234         title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3235             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3236
3237         # It isn't there in the HTML it returns to us
3238         # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3239
3240         description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3241
3242         info = {
3243             'id': shortened_video_id,
3244             'url': video_url,
3245             'ext': 'mp4',
3246             'title': title,
3247             # 'uploader_date': uploader_date,
3248             'description': description,
3249         }
3250         return [info]
3251
3252 class JustinTVIE(InfoExtractor):
3253     """Information extractor for justin.tv and twitch.tv"""
3254     # TODO: One broadcast may be split into multiple videos. The key
3255     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3256     # starts at 1 and increases. Can we treat all parts as one video?
3257
3258     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3259         (?:
3260             (?P<channelid>[^/]+)|
3261             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3262             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3263         )
3264         /?(?:\#.*)?$
3265         """
3266     _JUSTIN_PAGE_LIMIT = 100
3267     IE_NAME = u'justin.tv'
3268
3269     def report_download_page(self, channel, offset):
3270         """Report attempt to download a single page of videos."""
3271         self.to_screen(u'%s: Downloading video information from %d to %d' %
3272                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3273
3274     # Return count of items, list of *valid* items
3275     def _parse_page(self, url, video_id):
3276         webpage = self._download_webpage(url, video_id,
3277                                          u'Downloading video info JSON',
3278                                          u'unable to download video info JSON')
3279
3280         response = json.loads(webpage)
3281         if type(response) != list:
3282             error_text = response.get('error', 'unknown error')
3283             raise ExtractorError(u'Justin.tv API: %s' % error_text)
3284         info = []
3285         for clip in response:
3286             video_url = clip['video_file_url']
3287             if video_url:
3288                 video_extension = os.path.splitext(video_url)[1][1:]
3289                 video_date = re.sub('-', '', clip['start_time'][:10])
3290                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3291                 video_id = clip['id']
3292                 video_title = clip.get('title', video_id)
3293                 info.append({
3294                     'id': video_id,
3295                     'url': video_url,
3296                     'title': video_title,
3297                     'uploader': clip.get('channel_name', video_uploader_id),
3298                     'uploader_id': video_uploader_id,
3299                     'upload_date': video_date,
3300                     'ext': video_extension,
3301                 })
3302         return (len(response), info)
3303
3304     def _real_extract(self, url):
3305         mobj = re.match(self._VALID_URL, url)
3306         if mobj is None:
3307             raise ExtractorError(u'invalid URL: %s' % url)
3308
3309         api_base = 'http://api.justin.tv'
3310         paged = False
3311         if mobj.group('channelid'):
3312             paged = True
3313             video_id = mobj.group('channelid')
3314             api = api_base + '/channel/archives/%s.json' % video_id
3315         elif mobj.group('chapterid'):
3316             chapter_id = mobj.group('chapterid')
3317
3318             webpage = self._download_webpage(url, chapter_id)
3319             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3320             if not m:
3321                 raise ExtractorError(u'Cannot find archive of a chapter')
3322             archive_id = m.group(1)
3323
3324             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3325             chapter_info_xml = self._download_webpage(api, chapter_id,
3326                                              note=u'Downloading chapter information',
3327                                              errnote=u'Chapter information download failed')
3328             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3329             for a in doc.findall('.//archive'):
3330                 if archive_id == a.find('./id').text:
3331                     break
3332             else:
3333                 raise ExtractorError(u'Could not find chapter in chapter information')
3334
3335             video_url = a.find('./video_file_url').text
3336             video_ext = video_url.rpartition('.')[2] or u'flv'
3337
3338             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3339             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3340                                    note='Downloading chapter metadata',
3341                                    errnote='Download of chapter metadata failed')
3342             chapter_info = json.loads(chapter_info_json)
3343
3344             bracket_start = int(doc.find('.//bracket_start').text)
3345             bracket_end = int(doc.find('.//bracket_end').text)
3346
3347             # TODO determine start (and probably fix up file)
3348             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3349             #video_url += u'?start=' + TODO:start_timestamp
3350             # bracket_start is 13290, but we want 51670615
3351             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3352                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3353
3354             info = {
3355                 'id': u'c' + chapter_id,
3356                 'url': video_url,
3357                 'ext': video_ext,
3358                 'title': chapter_info['title'],
3359                 'thumbnail': chapter_info['preview'],
3360                 'description': chapter_info['description'],
3361                 'uploader': chapter_info['channel']['display_name'],
3362                 'uploader_id': chapter_info['channel']['name'],
3363             }
3364             return [info]
3365         else:
3366             video_id = mobj.group('videoid')
3367             api = api_base + '/broadcast/by_archive/%s.json' % video_id
3368
3369         self.report_extraction(video_id)
3370
3371         info = []
3372         offset = 0
3373         limit = self._JUSTIN_PAGE_LIMIT
3374         while True:
3375             if paged:
3376                 self.report_download_page(video_id, offset)
3377             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3378             page_count, page_info = self._parse_page(page_url, video_id)
3379             info.extend(page_info)
3380             if not paged or page_count != limit:
3381                 break
3382             offset += limit
3383         return info
3384
3385 class FunnyOrDieIE(InfoExtractor):
3386     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3387
3388     def _real_extract(self, url):
3389         mobj = re.match(self._VALID_URL, url)
3390         if mobj is None:
3391             raise ExtractorError(u'invalid URL: %s' % url)
3392
3393         video_id = mobj.group('id')
3394         webpage = self._download_webpage(url, video_id)
3395
3396         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3397             webpage, u'video URL', flags=re.DOTALL)
3398
3399         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3400             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3401
3402         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3403             webpage, u'description', fatal=False, flags=re.DOTALL)
3404
3405         info = {
3406             'id': video_id,
3407             'url': video_url,
3408             'ext': 'mp4',
3409             'title': title,
3410             'description': video_description,
3411         }
3412         return [info]
3413
3414 class SteamIE(InfoExtractor):
3415     _VALID_URL = r"""http://store\.steampowered\.com/
3416                 (agecheck/)?
3417                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3418                 (?P<gameID>\d+)/?
3419                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3420                 """
3421     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
3422     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
3423
3424     @classmethod
3425     def suitable(cls, url):
3426         """Receives a URL and returns True if suitable for this IE."""
3427         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3428
3429     def _real_extract(self, url):
3430         m = re.match(self._VALID_URL, url, re.VERBOSE)
3431         gameID = m.group('gameID')
3432
3433         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3434         webpage = self._download_webpage(videourl, gameID)
3435
3436         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3437             videourl = self._AGECHECK_TEMPLATE % gameID
3438             self.report_age_confirmation()
3439             webpage = self._download_webpage(videourl, gameID)
3440
3441         self.report_extraction(gameID)
3442         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3443                                              webpage, 'game title')
3444
3445         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3446         mweb = re.finditer(urlRE, webpage)
3447         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3448         titles = re.finditer(namesRE, webpage)
3449         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3450         thumbs = re.finditer(thumbsRE, webpage)
3451         videos = []
3452         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3453             video_id = vid.group('videoID')
3454             title = vtitle.group('videoName')
3455             video_url = vid.group('videoURL')
3456             video_thumb = thumb.group('thumbnail')
3457             if not video_url:
3458                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3459             info = {
3460                 'id':video_id,
3461                 'url':video_url,
3462                 'ext': 'flv',
3463                 'title': unescapeHTML(title),
3464                 'thumbnail': video_thumb
3465                   }
3466             videos.append(info)
3467         return [self.playlist_result(videos, gameID, game_title)]
3468
3469 class UstreamIE(InfoExtractor):
3470     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3471     IE_NAME = u'ustream'
3472
3473     def _real_extract(self, url):
3474         m = re.match(self._VALID_URL, url)
3475         video_id = m.group('videoID')
3476
3477         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3478         webpage = self._download_webpage(url, video_id)
3479
3480         self.report_extraction(video_id)
3481
3482         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3483             webpage, u'title')
3484
3485         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3486             webpage, u'uploader', fatal=False, flags=re.DOTALL)
3487
3488         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3489             webpage, u'thumbnail', fatal=False)
3490
3491         info = {
3492                 'id': video_id,
3493                 'url': video_url,
3494                 'ext': 'flv',
3495                 'title': video_title,
3496                 'uploader': uploader,
3497                 'thumbnail': thumbnail,
3498                }
3499         return info
3500
3501 class WorldStarHipHopIE(InfoExtractor):
3502     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3503     IE_NAME = u'WorldStarHipHop'
3504
3505     def _real_extract(self, url):
3506         m = re.match(self._VALID_URL, url)
3507         video_id = m.group('id')
3508
3509         webpage_src = self._download_webpage(url, video_id)
3510
3511         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3512             webpage_src, u'video URL')
3513
3514         if 'mp4' in video_url:
3515             ext = 'mp4'
3516         else:
3517             ext = 'flv'
3518
3519         video_title = self._html_search_regex(r"<title>(.*)</title>",
3520             webpage_src, u'title')
3521
3522         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3523         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3524             webpage_src, u'thumbnail', fatal=False)
3525
3526         if not thumbnail:
3527             _title = r"""candytitles.*>(.*)</span>"""
3528             mobj = re.search(_title, webpage_src)
3529             if mobj is not None:
3530                 video_title = mobj.group(1)
3531
3532         results = [{
3533                     'id': video_id,
3534                     'url' : video_url,
3535                     'title' : video_title,
3536                     'thumbnail' : thumbnail,
3537                     'ext' : ext,
3538                     }]
3539         return results
3540
3541 class RBMARadioIE(InfoExtractor):
3542     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3543
3544     def _real_extract(self, url):
3545         m = re.match(self._VALID_URL, url)
3546         video_id = m.group('videoID')
3547
3548         webpage = self._download_webpage(url, video_id)
3549
3550         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
3551             webpage, u'json data', flags=re.MULTILINE)
3552
3553         try:
3554             data = json.loads(json_data)
3555         except ValueError as e:
3556             raise ExtractorError(u'Invalid JSON: ' + str(e))
3557
3558         video_url = data['akamai_url'] + '&cbr=256'
3559         url_parts = compat_urllib_parse_urlparse(video_url)
3560         video_ext = url_parts.path.rpartition('.')[2]
3561         info = {
3562                 'id': video_id,
3563                 'url': video_url,
3564                 'ext': video_ext,
3565                 'title': data['title'],
3566                 'description': data.get('teaser_text'),
3567                 'location': data.get('country_of_origin'),
3568                 'uploader': data.get('host', {}).get('name'),
3569                 'uploader_id': data.get('host', {}).get('slug'),
3570                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3571                 'duration': data.get('duration'),
3572         }
3573         return [info]
3574
3575
3576 class YouPornIE(InfoExtractor):
3577     """Information extractor for youporn.com."""
3578     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3579
3580     def _print_formats(self, formats):
3581         """Print all available formats"""
3582         print(u'Available formats:')
3583         print(u'ext\t\tformat')
3584         print(u'---------------------------------')
3585         for format in formats:
3586             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3587
3588     def _specific(self, req_format, formats):
3589         for x in formats:
3590             if(x["format"]==req_format):
3591                 return x
3592         return None
3593
3594     def _real_extract(self, url):
3595         mobj = re.match(self._VALID_URL, url)
3596         if mobj is None:
3597             raise ExtractorError(u'Invalid URL: %s' % url)
3598         video_id = mobj.group('videoid')
3599
3600         req = compat_urllib_request.Request(url)
3601         req.add_header('Cookie', 'age_verified=1')
3602         webpage = self._download_webpage(req, video_id)
3603
3604         # Get JSON parameters
3605         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3606         try:
3607             params = json.loads(json_params)
3608         except:
3609             raise ExtractorError(u'Invalid JSON')
3610
3611         self.report_extraction(video_id)
3612         try:
3613             video_title = params['title']
3614             upload_date = unified_strdate(params['release_date_f'])
3615             video_description = params['description']
3616             video_uploader = params['submitted_by']
3617             thumbnail = params['thumbnails'][0]['image']
3618         except KeyError:
3619             raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3620
3621         # Get all of the formats available
3622         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3623         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3624             webpage, u'download list').strip()
3625
3626         # Get all of the links from the page
3627         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3628         links = re.findall(LINK_RE, download_list_html)
3629         if(len(links) == 0):
3630             raise ExtractorError(u'ERROR: no known formats available for video')
3631
3632         self.to_screen(u'Links found: %d' % len(links))
3633
3634         formats = []
3635         for link in links:
3636
3637             # A link looks like this:
3638             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3639             # A path looks like this:
3640             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3641             video_url = unescapeHTML( link )
3642             path = compat_urllib_parse_urlparse( video_url ).path
3643             extension = os.path.splitext( path )[1][1:]
3644             format = path.split('/')[4].split('_')[:2]
3645             size = format[0]
3646             bitrate = format[1]
3647             format = "-".join( format )
3648             # title = u'%s-%s-%s' % (video_title, size, bitrate)
3649
3650             formats.append({
3651                 'id': video_id,
3652                 'url': video_url,
3653                 'uploader': video_uploader,
3654                 'upload_date': upload_date,
3655                 'title': video_title,
3656                 'ext': extension,
3657                 'format': format,
3658                 'thumbnail': thumbnail,
3659                 'description': video_description
3660             })
3661
3662         if self._downloader.params.get('listformats', None):
3663             self._print_formats(formats)
3664             return
3665
3666         req_format = self._downloader.params.get('format', None)
3667         self.to_screen(u'Format: %s' % req_format)
3668
3669         if req_format is None or req_format == 'best':
3670             return [formats[0]]
3671         elif req_format == 'worst':
3672             return [formats[-1]]
3673         elif req_format in ('-1', 'all'):
3674             return formats
3675         else:
3676             format = self._specific( req_format, formats )
3677             if result is None:
3678                 raise ExtractorError(u'Requested format not available')
3679             return [format]
3680
3681
3682
3683 class PornotubeIE(InfoExtractor):
3684     """Information extractor for pornotube.com."""
3685     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3686
3687     def _real_extract(self, url):
3688         mobj = re.match(self._VALID_URL, url)
3689         if mobj is None:
3690             raise ExtractorError(u'Invalid URL: %s' % url)
3691
3692         video_id = mobj.group('videoid')
3693         video_title = mobj.group('title')
3694
3695         # Get webpage content
3696         webpage = self._download_webpage(url, video_id)
3697
3698         # Get the video URL
3699         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3700         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3701         video_url = compat_urllib_parse.unquote(video_url)
3702
3703         #Get the uploaded date
3704         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3705         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
3706         if upload_date: upload_date = unified_strdate(upload_date)
3707
3708         info = {'id': video_id,
3709                 'url': video_url,
3710                 'uploader': None,
3711                 'upload_date': upload_date,
3712                 'title': video_title,
3713                 'ext': 'flv',
3714                 'format': 'flv'}
3715
3716         return [info]
3717
3718 class YouJizzIE(InfoExtractor):
3719     """Information extractor for youjizz.com."""
3720     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3721
3722     def _real_extract(self, url):
3723         mobj = re.match(self._VALID_URL, url)
3724         if mobj is None:
3725             raise ExtractorError(u'Invalid URL: %s' % url)
3726
3727         video_id = mobj.group('videoid')
3728
3729         # Get webpage content
3730         webpage = self._download_webpage(url, video_id)
3731
3732         # Get the video title
3733         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3734             webpage, u'title').strip()
3735
3736         # Get the embed page
3737         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3738         if result is None:
3739             raise ExtractorError(u'ERROR: unable to extract embed page')
3740
3741         embed_page_url = result.group(0).strip()
3742         video_id = result.group('videoid')
3743
3744         webpage = self._download_webpage(embed_page_url, video_id)
3745
3746         # Get the video URL
3747         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3748             webpage, u'video URL')
3749
3750         info = {'id': video_id,
3751                 'url': video_url,
3752                 'title': video_title,
3753                 'ext': 'flv',
3754                 'format': 'flv',
3755                 'player_url': embed_page_url}
3756
3757         return [info]
3758
3759 class EightTracksIE(InfoExtractor):
3760     IE_NAME = '8tracks'
3761     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3762
3763     def _real_extract(self, url):
3764         mobj = re.match(self._VALID_URL, url)
3765         if mobj is None:
3766             raise ExtractorError(u'Invalid URL: %s' % url)
3767         playlist_id = mobj.group('id')
3768
3769         webpage = self._download_webpage(url, playlist_id)
3770
3771         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3772         data = json.loads(json_like)
3773
3774         session = str(random.randint(0, 1000000000))
3775         mix_id = data['id']
3776         track_count = data['tracks_count']
3777         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3778         next_url = first_url
3779         res = []
3780         for i in itertools.count():
3781             api_json = self._download_webpage(next_url, playlist_id,
3782                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3783                 errnote=u'Failed to download song information')
3784             api_data = json.loads(api_json)
3785             track_data = api_data[u'set']['track']
3786             info = {
3787                 'id': track_data['id'],
3788                 'url': track_data['track_file_stream_url'],
3789                 'title': track_data['performer'] + u' - ' + track_data['name'],
3790                 'raw_title': track_data['name'],
3791                 'uploader_id': data['user']['login'],
3792                 'ext': 'm4a',
3793             }
3794             res.append(info)
3795             if api_data['set']['at_last_track']:
3796                 break
3797             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3798         return res
3799
3800 class KeekIE(InfoExtractor):
3801     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3802     IE_NAME = u'keek'
3803
3804     def _real_extract(self, url):
3805         m = re.match(self._VALID_URL, url)
3806         video_id = m.group('videoID')
3807
3808         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3809         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3810         webpage = self._download_webpage(url, video_id)
3811
3812         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3813             webpage, u'title')
3814
3815         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3816             webpage, u'uploader', fatal=False)
3817
3818         info = {
3819                 'id': video_id,
3820                 'url': video_url,
3821                 'ext': 'mp4',
3822                 'title': video_title,
3823                 'thumbnail': thumbnail,
3824                 'uploader': uploader
3825         }
3826         return [info]
3827
3828 class TEDIE(InfoExtractor):
3829     _VALID_URL=r'''http://www\.ted\.com/
3830                    (
3831                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3832                         |
3833                         ((?P<type_talk>talks)) # We have a simple talk
3834                    )
3835                    (/lang/(.*?))? # The url may contain the language
3836                    /(?P<name>\w+) # Here goes the name and then ".html"
3837                    '''
3838
3839     @classmethod
3840     def suitable(cls, url):
3841         """Receives a URL and returns True if suitable for this IE."""
3842         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3843
3844     def _real_extract(self, url):
3845         m=re.match(self._VALID_URL, url, re.VERBOSE)
3846         if m.group('type_talk'):
3847             return [self._talk_info(url)]
3848         else :
3849             playlist_id=m.group('playlist_id')
3850             name=m.group('name')
3851             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3852             return [self._playlist_videos_info(url,name,playlist_id)]
3853
3854     def _playlist_videos_info(self,url,name,playlist_id=0):
3855         '''Returns the videos of the playlist'''
3856         video_RE=r'''
3857                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3858                      ([.\s]*?)data-playlist_item_id="(\d+)"
3859                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3860                      '''
3861         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3862         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3863         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3864         m_names=re.finditer(video_name_RE,webpage)
3865
3866         playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
3867                                                  webpage, 'playlist title')
3868
3869         playlist_entries = []
3870         for m_video, m_name in zip(m_videos,m_names):
3871             video_id=m_video.group('video_id')
3872             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3873             playlist_entries.append(self.url_result(talk_url, 'TED'))
3874         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3875
3876     def _talk_info(self, url, video_id=0):
3877         """Return the video for the talk in the url"""
3878         m = re.match(self._VALID_URL, url,re.VERBOSE)
3879         video_name = m.group('name')
3880         webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
3881         self.report_extraction(video_name)
3882         # If the url includes the language we get the title translated
3883         title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
3884                                         webpage, 'title')
3885         json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
3886                                     webpage, 'json data')
3887         info = json.loads(json_data)
3888         desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
3889                                        webpage, 'description', flags = re.DOTALL)
3890
3891         thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
3892                                        webpage, 'thumbnail')
3893         info = {
3894                 'id': info['id'],
3895                 'url': info['htmlStreams'][-1]['file'],
3896                 'ext': 'mp4',
3897                 'title': title,
3898                 'thumbnail': thumbnail,
3899                 'description': desc,
3900                 }
3901         return info
3902
3903 class MySpassIE(InfoExtractor):
3904     _VALID_URL = r'http://www.myspass.de/.*'
3905
3906     def _real_extract(self, url):
3907         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3908
3909         # video id is the last path element of the URL
3910         # usually there is a trailing slash, so also try the second but last
3911         url_path = compat_urllib_parse_urlparse(url).path
3912         url_parent_path, video_id = os.path.split(url_path)
3913         if not video_id:
3914             _, video_id = os.path.split(url_parent_path)
3915
3916         # get metadata
3917         metadata_url = META_DATA_URL_TEMPLATE % video_id
3918         metadata_text = self._download_webpage(metadata_url, video_id)
3919         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3920
3921         # extract values from metadata
3922         url_flv_el = metadata.find('url_flv')
3923         if url_flv_el is None:
3924             raise ExtractorError(u'Unable to extract download url')
3925         video_url = url_flv_el.text
3926         extension = os.path.splitext(video_url)[1][1:]
3927         title_el = metadata.find('title')
3928         if title_el is None:
3929             raise ExtractorError(u'Unable to extract title')
3930         title = title_el.text
3931         format_id_el = metadata.find('format_id')
3932         if format_id_el is None:
3933             format = ext
3934         else:
3935             format = format_id_el.text
3936         description_el = metadata.find('description')
3937         if description_el is not None:
3938             description = description_el.text
3939         else:
3940             description = None
3941         imagePreview_el = metadata.find('imagePreview')
3942         if imagePreview_el is not None:
3943             thumbnail = imagePreview_el.text
3944         else:
3945             thumbnail = None
3946         info = {
3947             'id': video_id,
3948             'url': video_url,
3949             'title': title,
3950             'ext': extension,
3951             'format': format,
3952             'thumbnail': thumbnail,
3953             'description': description
3954         }
3955         return [info]
3956
3957 class SpiegelIE(InfoExtractor):
3958     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3959
3960     def _real_extract(self, url):
3961         m = re.match(self._VALID_URL, url)
3962         video_id = m.group('videoID')
3963
3964         webpage = self._download_webpage(url, video_id)
3965
3966         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3967             webpage, u'title')
3968
3969         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3970         xml_code = self._download_webpage(xml_url, video_id,
3971                     note=u'Downloading XML', errnote=u'Failed to download XML')
3972
3973         idoc = xml.etree.ElementTree.fromstring(xml_code)
3974         last_type = idoc[-1]
3975         filename = last_type.findall('./filename')[0].text
3976         duration = float(last_type.findall('./duration')[0].text)
3977
3978         video_url = 'http://video2.spiegel.de/flash/' + filename
3979         video_ext = filename.rpartition('.')[2]
3980         info = {
3981             'id': video_id,
3982             'url': video_url,
3983             'ext': video_ext,
3984             'title': video_title,
3985             'duration': duration,
3986         }
3987         return [info]
3988
3989 class LiveLeakIE(InfoExtractor):
3990
3991     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3992     IE_NAME = u'liveleak'
3993
3994     def _real_extract(self, url):
3995         mobj = re.match(self._VALID_URL, url)
3996         if mobj is None:
3997             raise ExtractorError(u'Invalid URL: %s' % url)
3998
3999         video_id = mobj.group('video_id')
4000
4001         webpage = self._download_webpage(url, video_id)
4002
4003         video_url = self._search_regex(r'file: "(.*?)",',
4004             webpage, u'video URL')
4005
4006         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
4007             webpage, u'title').replace('LiveLeak.com -', '').strip()
4008
4009         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
4010             webpage, u'description', fatal=False)
4011
4012         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
4013             webpage, u'uploader', fatal=False)
4014
4015         info = {
4016             'id':  video_id,
4017             'url': video_url,
4018             'ext': 'mp4',
4019             'title': video_title,
4020             'description': video_description,
4021             'uploader': video_uploader
4022         }
4023
4024         return [info]
4025
4026 class ARDIE(InfoExtractor):
4027     _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4028     _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4029     _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4030
4031     def _real_extract(self, url):
4032         # determine video id from url
4033         m = re.match(self._VALID_URL, url)
4034
4035         numid = re.search(r'documentId=([0-9]+)', url)
4036         if numid:
4037             video_id = numid.group(1)
4038         else:
4039             video_id = m.group('video_id')
4040
4041         # determine title and media streams from webpage
4042         html = self._download_webpage(url, video_id)
4043         title = re.search(self._TITLE, html).group('title')
4044         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4045         if not streams:
4046             assert '"fsk"' in html
4047             raise ExtractorError(u'This video is only available after 8:00 pm')
4048
4049         # choose default media type and highest quality for now
4050         stream = max([s for s in streams if int(s["media_type"]) == 0],
4051                      key=lambda s: int(s["quality"]))
4052
4053         # there's two possibilities: RTMP stream or HTTP download
4054         info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4055         if stream['rtmp_url']:
4056             self.to_screen(u'RTMP download detected')
4057             assert stream['video_url'].startswith('mp4:')
4058             info["url"] = stream["rtmp_url"]
4059             info["play_path"] = stream['video_url']
4060         else:
4061             assert stream["video_url"].endswith('.mp4')
4062             info["url"] = stream["video_url"]
4063         return [info]
4064
4065 class ZDFIE(InfoExtractor):
4066     _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4067     _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
4068     _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
4069     _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
4070     _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
4071
4072     def _real_extract(self, url):
4073         mobj = re.match(self._VALID_URL, url)
4074         if mobj is None:
4075             raise ExtractorError(u'Invalid URL: %s' % url)
4076         video_id = mobj.group('video_id')
4077
4078         html = self._download_webpage(url, video_id)
4079         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4080         if streams is None:
4081             raise ExtractorError(u'No media url found.')
4082
4083         # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
4084         # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
4085         # choose first/default media type and highest quality for now
4086         for s in streams:        #find 300 - dsl1000mbit
4087             if s['quality'] == '300' and s['media_type'] == 'wstreaming':
4088                 stream_=s
4089                 break
4090         for s in streams:        #find veryhigh - dsl2000mbit
4091             if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
4092                 stream_=s
4093                 break
4094         if stream_ is None:
4095             raise ExtractorError(u'No stream found.')
4096
4097         media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
4098
4099         self.report_extraction(video_id)
4100         mobj = re.search(self._TITLE, html)
4101         if mobj is None:
4102             raise ExtractorError(u'Cannot extract title')
4103         title = unescapeHTML(mobj.group('title'))
4104
4105         mobj = re.search(self._MMS_STREAM, media_link)
4106         if mobj is None:
4107             mobj = re.search(self._RTSP_STREAM, media_link)
4108             if mobj is None:
4109                 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
4110         mms_url = mobj.group('video_url')
4111
4112         mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
4113         if mobj is None:
4114             raise ExtractorError(u'Cannot extract extention')
4115         ext = mobj.group('ext')
4116
4117         return [{'id': video_id,
4118                  'url': mms_url,
4119                  'title': title,
4120                  'ext': ext
4121                  }]
4122
4123 class TumblrIE(InfoExtractor):
4124     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4125
4126     def _real_extract(self, url):
4127         m_url = re.match(self._VALID_URL, url)
4128         video_id = m_url.group('id')
4129         blog = m_url.group('blog_name')
4130
4131         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4132         webpage = self._download_webpage(url, video_id)
4133
4134         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4135         video = re.search(re_video, webpage)
4136         if video is None:
4137            raise ExtractorError(u'Unable to extract video')
4138         video_url = video.group('video_url')
4139         ext = video.group('ext')
4140
4141         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
4142             webpage, u'thumbnail', fatal=False)  # We pick the first poster
4143         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
4144
4145         # The only place where you can get a title, it's not complete,
4146         # but searching in other places doesn't work for all videos
4147         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
4148             webpage, u'title', flags=re.DOTALL)
4149
4150         return [{'id': video_id,
4151                  'url': video_url,
4152                  'title': video_title,
4153                  'thumbnail': video_thumbnail,
4154                  'ext': ext
4155                  }]
4156
4157 class BandcampIE(InfoExtractor):
4158     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4159
4160     def _real_extract(self, url):
4161         mobj = re.match(self._VALID_URL, url)
4162         title = mobj.group('title')
4163         webpage = self._download_webpage(url, title)
4164         # We get the link to the free download page
4165         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4166         if m_download is None:
4167             raise ExtractorError(u'No free songs found')
4168
4169         download_link = m_download.group(1)
4170         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4171                        webpage, re.MULTILINE|re.DOTALL).group('id')
4172
4173         download_webpage = self._download_webpage(download_link, id,
4174                                                   'Downloading free downloads page')
4175         # We get the dictionary of the track from some javascrip code
4176         info = re.search(r'items: (.*?),$',
4177                          download_webpage, re.MULTILINE).group(1)
4178         info = json.loads(info)[0]
4179         # We pick mp3-320 for now, until format selection can be easily implemented.
4180         mp3_info = info[u'downloads'][u'mp3-320']
4181         # If we try to use this url it says the link has expired
4182         initial_url = mp3_info[u'url']
4183         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4184         m_url = re.match(re_url, initial_url)
4185         #We build the url we will use to get the final track url
4186         # This url is build in Bandcamp in the script download_bunde_*.js
4187         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4188         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4189         # If we could correctly generate the .rand field the url would be
4190         #in the "download_url" key
4191         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4192
4193         track_info = {'id':id,
4194                       'title' : info[u'title'],
4195                       'ext' :   'mp3',
4196                       'url' :   final_url,
4197                       'thumbnail' : info[u'thumb_url'],
4198                       'uploader' :  info[u'artist']
4199                       }
4200
4201         return [track_info]
4202
4203 class RedTubeIE(InfoExtractor):
4204     """Information Extractor for redtube"""
4205     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4206
4207     def _real_extract(self,url):
4208         mobj = re.match(self._VALID_URL, url)
4209         if mobj is None:
4210             raise ExtractorError(u'Invalid URL: %s' % url)
4211
4212         video_id = mobj.group('id')
4213         video_extension = 'mp4'
4214         webpage = self._download_webpage(url, video_id)
4215
4216         self.report_extraction(video_id)
4217
4218         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
4219             webpage, u'video URL')
4220
4221         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
4222             webpage, u'title')
4223
4224         return [{
4225             'id':       video_id,
4226             'url':      video_url,
4227             'ext':      video_extension,
4228             'title':    video_title,
4229         }]
4230
4231 class InaIE(InfoExtractor):
4232     """Information Extractor for Ina.fr"""
4233     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4234
4235     def _real_extract(self,url):
4236         mobj = re.match(self._VALID_URL, url)
4237
4238         video_id = mobj.group('id')
4239         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4240         video_extension = 'mp4'
4241         webpage = self._download_webpage(mrss_url, video_id)
4242
4243         self.report_extraction(video_id)
4244
4245         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
4246             webpage, u'video URL')
4247
4248         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
4249             webpage, u'title')
4250
4251         return [{
4252             'id':       video_id,
4253             'url':      video_url,
4254             'ext':      video_extension,
4255             'title':    video_title,
4256         }]
4257
4258 class HowcastIE(InfoExtractor):
4259     """Information Extractor for Howcast.com"""
4260     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4261
4262     def _real_extract(self, url):
4263         mobj = re.match(self._VALID_URL, url)
4264
4265         video_id = mobj.group('id')
4266         webpage_url = 'http://www.howcast.com/videos/' + video_id
4267         webpage = self._download_webpage(webpage_url, video_id)
4268
4269         self.report_extraction(video_id)
4270
4271         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
4272             webpage, u'video URL')
4273
4274         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
4275             webpage, u'title')
4276
4277         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
4278             webpage, u'description', fatal=False)
4279
4280         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
4281             webpage, u'thumbnail', fatal=False)
4282
4283         return [{
4284             'id':       video_id,
4285             'url':      video_url,
4286             'ext':      'mp4',
4287             'title':    video_title,
4288             'description': video_description,
4289             'thumbnail': thumbnail,
4290         }]
4291
4292 class VineIE(InfoExtractor):
4293     """Information Extractor for Vine.co"""
4294     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4295
4296     def _real_extract(self, url):
4297         mobj = re.match(self._VALID_URL, url)
4298
4299         video_id = mobj.group('id')
4300         webpage_url = 'https://vine.co/v/' + video_id
4301         webpage = self._download_webpage(webpage_url, video_id)
4302
4303         self.report_extraction(video_id)
4304
4305         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
4306             webpage, u'video URL')
4307
4308         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
4309             webpage, u'title')
4310
4311         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
4312             webpage, u'thumbnail', fatal=False)
4313
4314         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
4315             webpage, u'uploader', fatal=False, flags=re.DOTALL)
4316
4317         return [{
4318             'id':        video_id,
4319             'url':       video_url,
4320             'ext':       'mp4',
4321             'title':     video_title,
4322             'thumbnail': thumbnail,
4323             'uploader':  uploader,
4324         }]
4325
4326 class FlickrIE(InfoExtractor):
4327     """Information Extractor for Flickr videos"""
4328     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4329
4330     def _real_extract(self, url):
4331         mobj = re.match(self._VALID_URL, url)
4332
4333         video_id = mobj.group('id')
4334         video_uploader_id = mobj.group('uploader_id')
4335         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4336         webpage = self._download_webpage(webpage_url, video_id)
4337
4338         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
4339
4340         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4341         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
4342
4343         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
4344             first_xml, u'node_id')
4345
4346         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4347         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4348
4349         self.report_extraction(video_id)
4350
4351         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
4352         if mobj is None:
4353             raise ExtractorError(u'Unable to extract video url')
4354         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
4355
4356         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
4357             webpage, u'video title')
4358
4359         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
4360             webpage, u'description', fatal=False)
4361
4362         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
4363             webpage, u'thumbnail', fatal=False)
4364
4365         return [{
4366             'id':          video_id,
4367             'url':         video_url,
4368             'ext':         'mp4',
4369             'title':       video_title,
4370             'description': video_description,
4371             'thumbnail':   thumbnail,
4372             'uploader_id': video_uploader_id,
4373         }]
4374
4375 class TeamcocoIE(InfoExtractor):
4376     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4377
4378     def _real_extract(self, url):
4379         mobj = re.match(self._VALID_URL, url)
4380         if mobj is None:
4381             raise ExtractorError(u'Invalid URL: %s' % url)
4382         url_title = mobj.group('url_title')
4383         webpage = self._download_webpage(url, url_title)
4384
4385         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
4386             webpage, u'video id')
4387
4388         self.report_extraction(video_id)
4389
4390         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
4391             webpage, u'title')
4392
4393         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
4394             webpage, u'thumbnail', fatal=False)
4395
4396         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
4397             webpage, u'description', fatal=False)
4398
4399         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4400         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
4401
4402         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
4403             data, u'video URL')
4404
4405         return [{
4406             'id':          video_id,
4407             'url':         video_url,
4408             'ext':         'mp4',
4409             'title':       video_title,
4410             'thumbnail':   thumbnail,
4411             'description': video_description,
4412         }]
4413
4414 class XHamsterIE(InfoExtractor):
4415     """Information Extractor for xHamster"""
4416     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4417
4418     def _real_extract(self,url):
4419         mobj = re.match(self._VALID_URL, url)
4420
4421         video_id = mobj.group('id')
4422         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
4423         webpage = self._download_webpage(mrss_url, video_id)
4424
4425         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4426         if mobj is None:
4427             raise ExtractorError(u'Unable to extract media URL')
4428         if len(mobj.group('server')) == 0:
4429             video_url = compat_urllib_parse.unquote(mobj.group('file'))
4430         else:
4431             video_url = mobj.group('server')+'/key='+mobj.group('file')
4432         video_extension = video_url.split('.')[-1]
4433
4434         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
4435             webpage, u'title')
4436
4437         # Can't see the description anywhere in the UI
4438         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
4439         #     webpage, u'description', fatal=False)
4440         # if video_description: video_description = unescapeHTML(video_description)
4441
4442         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4443         if mobj:
4444             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4445         else:
4446             video_upload_date = None
4447             self._downloader.report_warning(u'Unable to extract upload date')
4448
4449         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
4450             webpage, u'uploader id', default=u'anonymous')
4451
4452         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
4453             webpage, u'thumbnail', fatal=False)
4454
4455         return [{
4456             'id':       video_id,
4457             'url':      video_url,
4458             'ext':      video_extension,
4459             'title':    video_title,
4460             # 'description': video_description,
4461             'upload_date': video_upload_date,
4462             'uploader_id': video_uploader_id,
4463             'thumbnail': video_thumbnail
4464         }]
4465
4466 class HypemIE(InfoExtractor):
4467     """Information Extractor for hypem"""
4468     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4469
4470     def _real_extract(self, url):
4471         mobj = re.match(self._VALID_URL, url)
4472         if mobj is None:
4473             raise ExtractorError(u'Invalid URL: %s' % url)
4474         track_id = mobj.group(1)
4475
4476         data = { 'ax': 1, 'ts': time.time() }
4477         data_encoded = compat_urllib_parse.urlencode(data)
4478         complete_url = url + "?" + data_encoded
4479         request = compat_urllib_request.Request(complete_url)
4480         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
4481         cookie = urlh.headers.get('Set-Cookie', '')
4482
4483         self.report_extraction(track_id)
4484
4485         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
4486             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
4487         try:
4488             track_list = json.loads(html_tracks)
4489             track = track_list[u'tracks'][0]
4490         except ValueError:
4491             raise ExtractorError(u'Hypemachine contained invalid JSON.')
4492
4493         key = track[u"key"]
4494         track_id = track[u"id"]
4495         artist = track[u"artist"]
4496         title = track[u"song"]
4497
4498         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4499         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4500         request.add_header('cookie', cookie)
4501         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4502         try:
4503             song_data = json.loads(song_data_json)
4504         except ValueError:
4505             raise ExtractorError(u'Hypemachine contained invalid JSON.')
4506         final_url = song_data[u"url"]
4507
4508         return [{
4509             'id':       track_id,
4510             'url':      final_url,
4511             'ext':      "mp3",
4512             'title':    title,
4513             'artist':   artist,
4514         }]
4515
4516 class Vbox7IE(InfoExtractor):
4517     """Information Extractor for Vbox7"""
4518     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
4519
4520     def _real_extract(self,url):
4521         mobj = re.match(self._VALID_URL, url)
4522         if mobj is None:
4523             raise ExtractorError(u'Invalid URL: %s' % url)
4524         video_id = mobj.group(1)
4525
4526         redirect_page, urlh = self._download_webpage_handle(url, video_id)
4527         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
4528         redirect_url = urlh.geturl() + new_location
4529         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
4530
4531         title = self._html_search_regex(r'<title>(.*)</title>',
4532             webpage, u'title').split('/')[0].strip()
4533
4534         ext = "flv"
4535         info_url = "http://vbox7.com/play/magare.do"
4536         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
4537         info_request = compat_urllib_request.Request(info_url, data)
4538         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
4539         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
4540         if info_response is None:
4541             raise ExtractorError(u'Unable to extract the media url')
4542         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
4543
4544         return [{
4545             'id':        video_id,
4546             'url':       final_url,
4547             'ext':       ext,
4548             'title':     title,
4549             'thumbnail': thumbnail_url,
4550         }]
4551
4552 class GametrailersIE(InfoExtractor):
4553     _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
4554
4555     def _real_extract(self, url):
4556         mobj = re.match(self._VALID_URL, url)
4557         if mobj is None:
4558             raise ExtractorError(u'Invalid URL: %s' % url)
4559         video_id = mobj.group('id')
4560         video_type = mobj.group('type')
4561         webpage = self._download_webpage(url, video_id)
4562         if video_type == 'full-episodes':
4563             mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
4564         else:
4565             mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
4566         mgid = self._search_regex(mgid_re, webpage, u'mgid')
4567         data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
4568
4569         info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
4570                                            video_id, u'Downloading video info')
4571         links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
4572                                                video_id, u'Downloading video urls info')
4573
4574         self.report_extraction(video_id)
4575         info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
4576                       <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
4577                       <image>.*
4578                         <url>(?P<thumb>.*?)</url>.*
4579                       </image>'''
4580
4581         m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
4582         if m_info is None:
4583             raise ExtractorError(u'Unable to extract video info')
4584         video_title = m_info.group('title')
4585         video_description = m_info.group('description')
4586         video_thumb = m_info.group('thumb')
4587
4588         m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
4589         if m_urls is None or len(m_urls) == 0:
4590             raise ExtractError(u'Unable to extrat video url')
4591         # They are sorted from worst to best quality
4592         video_url = m_urls[-1].group('url')
4593
4594         return {'url':         video_url,
4595                 'id':          video_id,
4596                 'title':       video_title,
4597                 # Videos are actually flv not mp4
4598                 'ext':         'flv',
4599                 'thumbnail':   video_thumb,
4600                 'description': video_description,
4601                 }
4602
4603 class StatigramIE(InfoExtractor):
4604     _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)'
4605
4606     def _real_extract(self, url):
4607         mobj = re.match(self._VALID_URL, url)
4608
4609         video_id = mobj.group(1)
4610         webpage = self._download_webpage(url, video_id)
4611         video_url = self._html_search_regex(
4612             r'<meta property="og:video:secure_url" content="(.+?)">',
4613             webpage, u'video URL')
4614         thumbnail_url = self._html_search_regex(
4615             r'<meta property="og:image" content="(.+?)" />',
4616             webpage, u'thumbnail URL', fatal=False)
4617         html_title = self._html_search_regex(
4618             r'<title>(.+?)</title>',
4619             webpage, u'title')
4620         title = html_title.rpartition(u' | Statigram')[0]
4621         uploader_id = self._html_search_regex(
4622             r'@([^ ]+)', title, u'uploader name', fatal=False)
4623         ext = 'mp4'
4624
4625         return [{
4626             'id':        video_id,
4627             'url':       video_url,
4628             'ext':       ext,
4629             'title':     title,
4630             'thumbnail': thumbnail_url,
4631             'uploader_id' : uploader_id
4632         }]
4633
4634 def gen_extractors():
4635     """ Return a list of an instance of every supported extractor.
4636     The order does matter; the first extractor matched is the one handling the URL.
4637     """
4638     return [
4639         YoutubePlaylistIE(),
4640         YoutubeChannelIE(),
4641         YoutubeUserIE(),
4642         YoutubeSearchIE(),
4643         YoutubeIE(),
4644         MetacafeIE(),
4645         DailymotionIE(),
4646         GoogleSearchIE(),
4647         PhotobucketIE(),
4648         YahooIE(),
4649         YahooSearchIE(),
4650         DepositFilesIE(),
4651         FacebookIE(),
4652         BlipTVIE(),
4653         BlipTVUserIE(),
4654         VimeoIE(),
4655         MyVideoIE(),
4656         ComedyCentralIE(),
4657         EscapistIE(),
4658         CollegeHumorIE(),
4659         XVideosIE(),
4660         SoundcloudSetIE(),
4661         SoundcloudIE(),
4662         InfoQIE(),
4663         MixcloudIE(),
4664         StanfordOpenClassroomIE(),
4665         MTVIE(),
4666         YoukuIE(),
4667         XNXXIE(),
4668         YouJizzIE(),
4669         PornotubeIE(),
4670         YouPornIE(),
4671         GooglePlusIE(),
4672         ArteTvIE(),
4673         NBAIE(),
4674         WorldStarHipHopIE(),
4675         JustinTVIE(),
4676         FunnyOrDieIE(),
4677         SteamIE(),
4678         UstreamIE(),
4679         RBMARadioIE(),
4680         EightTracksIE(),
4681         KeekIE(),
4682         TEDIE(),
4683         MySpassIE(),
4684         SpiegelIE(),
4685         LiveLeakIE(),
4686         ARDIE(),
4687         ZDFIE(),
4688         TumblrIE(),
4689         BandcampIE(),
4690         RedTubeIE(),
4691         InaIE(),
4692         HowcastIE(),
4693         VineIE(),
4694         FlickrIE(),
4695         TeamcocoIE(),
4696         XHamsterIE(),
4697         HypemIE(),
4698         Vbox7IE(),
4699         GametrailersIE(),
4700         StatigramIE(),
4701         GenericIE()
4702     ]
4703
4704 def get_info_extractor(ie_name):
4705     """Returns the info extractor class with the given ie_name"""
4706     return globals()[ie_name+'IE']